algorembrant committed on
Commit 73d8b96 · verified · 1 Parent(s): 634251f

Upload model_aggressive.py

Files changed (1)
  1. model_aggressive.py +769 -0
model_aggressive.py ADDED
@@ -0,0 +1,769 @@
# NOTE FOR COLAB USERS: Run in a separate cell first:
# !pip -q install chess numpy torch matplotlib pandas

"""
Aggressive GRPO Chess Agent — T4/Colab Optimized
"""

import os, sys, csv, time, math, shutil, argparse, random
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

try:
    import chess
except ImportError:
    os.system("pip install -q chess")
    import chess

import torch
import torch.nn as nn
import torch.nn.functional as F

# ── Hardware flags ─────────────────────────────────────────────────────────────
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
if hasattr(torch, 'set_float32_matmul_precision'):
    torch.set_float32_matmul_precision('high')

# ── Constants ──────────────────────────────────────────────────────────────────
PIECE_VAL = {
    chess.PAWN: 1.0, chess.KNIGHT: 3.0, chess.BISHOP: 3.2,
    chess.ROOK: 5.0, chess.QUEEN: 9.0, chess.KING: 0.0,
}
RANDOM_BASELINE_ELO = 800  # estimated Elo of a uniform-random player

CONFIG = {
    "num_envs": 256,
    "grpo_group_size": 8,      # G envs per group, all starting from the same opening position
    "ppo_epochs": 3,
    "mini_batch_size": 4096,
    "learning_rate": 2e-4,
    "weight_decay": 1e-4,
    "gamma": 0.98,             # lower → discount the future more → prefer fast wins
    "clip_epsilon": 0.15,
    "entropy_coef": 0.02,      # low → exploit aggressive lines
    "value_coef": 0.5,
    "max_steps": 100,
    "opening_max_moves": 10,   # randomize openings for GRPO diversity
    "checkpoint_dir": "./checkpoints",
    "save_interval": 50,
    "log_interval": 1,
    "elo_eval_interval": 100,  # evaluate Elo every N iterations
    "elo_eval_games": 32,
    "max_runtime_hours": 4.5,  # auto-save + download before Colab kills the session
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "seed": 42,
}
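
# With the settings above, each iteration runs 256 envs as 256 / 8 = 32 GRPO
# groups of G = 8; every group shares one randomized opening (see
# get_opening_position below), so group members differ only in sampled actions.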

# ── Action Space ───────────────────────────────────────────────────────────────
class ActionMapper:
    __slots__ = ['move_to_idx', 'idx_to_move', 'num_actions']

    def __init__(self):
        self.move_to_idx: dict[str, int] = {}
        self.idx_to_move: list[str] = []
        idx = 0
        for f in range(64):
            for t in range(64):
                if f == t: continue
                uci = chess.SQUARE_NAMES[f] + chess.SQUARE_NAMES[t]
                self.move_to_idx[uci] = idx
                self.idx_to_move.append(uci)
                idx += 1
                # Promotion variants for moves leaving ranks 2/7; targets that
                # cannot actually promote are included but are never legal, so
                # the legality mask keeps them at probability ~0.
                if chess.square_rank(f) in (1, 6) and \
                        abs(chess.square_file(f) - chess.square_file(t)) <= 1:
                    for promo in "nbrq":
                        puci = uci + promo
                        self.move_to_idx[puci] = idx
                        self.idx_to_move.append(puci)
                        idx += 1
        self.num_actions = idx

ACTION_MAPPER = ActionMapper()
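
# Illustrative sanity check (not part of the original training flow):
#   idx = ACTION_MAPPER.move_to_idx["e2e4"]
#   ACTION_MAPPER.idx_to_move[idx]      # -> "e2e4"
#   ACTION_MAPPER.move_to_idx["e7e8q"]  # promotion variants get their own index
# The base from/to pairs alone contribute 64 * 63 = 4032 actions; promotion
# suffixes account for the rest of num_actions.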

# ── Board Encoding ─────────────────────────────────────────────────────────────
def populate_states_fast(envs: list, active_mask: np.ndarray,
                         bbs_np: np.ndarray, meta_np: np.ndarray) -> None:
    """Fill bbs_np [B,12] uint64 and meta_np [B,3] float32 for active envs."""
    for b in range(len(envs)):
        if not active_mask[b]: continue
        env = envs[b]
        w = env.occupied_co[chess.WHITE]
        bc = env.occupied_co[chess.BLACK]
        # NOTE: bbs_np must be uint64 — python-chess bitboards use the full
        # 64-bit range (a piece on h8 sets bit 63), which overflows int64.
        bbs_np[b, 0] = env.pawns & w;    bbs_np[b, 1] = env.knights & w
        bbs_np[b, 2] = env.bishops & w;  bbs_np[b, 3] = env.rooks & w
        bbs_np[b, 4] = env.queens & w;   bbs_np[b, 5] = env.kings & w
        bbs_np[b, 6] = env.pawns & bc;   bbs_np[b, 7] = env.knights & bc
        bbs_np[b, 8] = env.bishops & bc; bbs_np[b, 9] = env.rooks & bc
        bbs_np[b, 10] = env.queens & bc; bbs_np[b, 11] = env.kings & bc
        meta_np[b, 0] = 1.0 if env.turn else -1.0
        # Castling rights: python-chess stores them as a bitboard of rook
        # squares, so pack the four rights into 4 bits before normalizing.
        cr = env.castling_rights
        cr4 = ((1 if cr & chess.BB_H1 else 0) | (2 if cr & chess.BB_A1 else 0) |
               (4 if cr & chess.BB_H8 else 0) | (8 if cr & chess.BB_A8 else 0))
        meta_np[b, 1] = cr4 / 15.0  # [0, 1]
        meta_np[b, 2] = 1.0 if env.ep_square is not None else 0.0

def get_legal_masks(envs: list, active_mask: np.ndarray):
    masks = np.zeros((len(envs), ACTION_MAPPER.num_actions), dtype=np.bool_)
    moves_list = [None] * len(envs)
    for b in range(len(envs)):
        if not active_mask[b]: continue
        legal = list(envs[b].legal_moves)
        moves_list[b] = legal
        for m in legal:
            masks[b, ACTION_MAPPER.move_to_idx[m.uci()]] = True
    return masks, moves_list

# ── Neural Network ─────────────────────────────────────────────────────────────
class ChessNet(nn.Module):
    def __init__(self, res_blocks: int = 8, channels: int = 128):
        super().__init__()
        self.conv_in = nn.Conv2d(14, channels, 3, padding=1, bias=False)
        self.bn_in = nn.BatchNorm2d(channels)
        self.res_blocks = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(channels, channels, 3, padding=1, bias=False),
                nn.BatchNorm2d(channels), nn.ReLU(inplace=True),
                nn.Conv2d(channels, channels, 3, padding=1, bias=False),
                nn.BatchNorm2d(channels),
            ) for _ in range(res_blocks)
        ])
        self.policy_head = nn.Sequential(
            nn.Conv2d(channels, 32, 1, bias=False), nn.BatchNorm2d(32),
            nn.ReLU(inplace=True), nn.Flatten(),
            nn.Linear(32 * 64, ACTION_MAPPER.num_actions),
        )
        # No Tanh — shaped rewards exceed [-1, 1]; the value output stays an
        # unbounded linear layer.
        self.value_head = nn.Sequential(
            nn.Conv2d(channels, 32, 1, bias=False), nn.BatchNorm2d(32),
            nn.ReLU(inplace=True), nn.Flatten(),
            nn.Linear(32 * 64, 256), nn.ReLU(inplace=True),
            nn.Linear(256, 1),
        )

    def forward(self, x):
        x = F.relu(self.bn_in(self.conv_in(x)), inplace=True)
        for blk in self.res_blocks:
            x = F.relu(x + blk(x), inplace=True)
        return self.policy_head(x), self.value_head(x)
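
# Shape walkthrough (reading off the layers above): input [B, 14, 8, 8]
# -> conv_in -> [B, 128, 8, 8] -> 8 residual blocks (shape-preserving)
# -> policy head [B, num_actions] and value head [B, 1]. Planes 0-11 hold the
# twelve piece bitboards, plane 12 the side to move, and plane 13 the castling
# encoding with an en-passant flag tucked into cell [13, 0, 1].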

# ── ELO Tracker ────────────────────────────────────────────────────────────────
class ELOTracker:
    def __init__(self, initial_elo: float = 1200.0, K: float = 32.0):
        self.elo = initial_elo
        self.K = K

    def expected(self, opp_elo: float) -> float:
        return 1.0 / (1.0 + 10.0 ** ((opp_elo - self.elo) / 400.0))

    def update(self, score: float, opp_elo: float) -> None:
        self.elo += self.K * (score - self.expected(opp_elo))
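
# Worked example of the update rule (standard Elo, K = 32): at a rating of
# 1200 against an 800-rated opponent, expected = 1 / (1 + 10 ** -1) ≈ 0.909,
# so a win adds 32 * (1 - 0.909) ≈ +2.9 while a loss costs ≈ -29.1. Beating
# the random baseline therefore moves the rating only slowly once well above it.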

# ── Opening Position Generator ─────────────────────────────────────────────────
def get_opening_position(max_moves: int = 10) -> chess.Board:
    """Play 0..max_moves random half-moves from the start for GRPO diversity."""
    board = chess.Board()
    for _ in range(random.randint(0, max_moves)):
        if board.is_game_over(): break
        board.push(random.choice(list(board.legal_moves)))
    return chess.Board(board.fen())  # detached copy
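
# Note on the detached copy: rebuilding from FEN drops the move stack. The
# halfmove clock survives (it is part of the FEN string), but threefold-
# repetition history does not, so every env starts its repetition count fresh.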

# ── Auto-download ──────────────────────────────────────────────────────────────
def auto_download(checkpoint_dir: str) -> None:
    """Sync to Google Drive if mounted, else trigger browser downloads."""
    try:
        from google.colab import files as _cf
        drive_dst = '/content/drive/MyDrive/chess_agent'
        if os.path.exists('/content/drive/MyDrive'):
            os.makedirs(drive_dst, exist_ok=True)
            shutil.copytree(checkpoint_dir, drive_dst, dirs_exist_ok=True)
            print(f"[AutoSave] Synced → {drive_dst}")
        else:
            for fname in ['best.pt', 'latest.pt', 'training_log.csv',
                          'elo_log.csv', 'training_performance.png']:
                fpath = os.path.join(checkpoint_dir, fname)
                if os.path.exists(fpath):
                    _cf.download(fpath)
                    print(f"[AutoSave] Downloaded {fname}")
    except Exception as e:
        print(f"[AutoSave] {e}")

# ── GRPO Trainer ───────────────────────────────────────────────────────────────
class GRPOTrainer:

    def __init__(self):
        self.device = CONFIG["device"]

        _model = ChessNet(res_blocks=8, channels=128)
        _model = _model.to(self.device).to(memory_format=torch.channels_last)
        try:
            print("Compiling model (reduce-overhead)…")
            self.model = torch.compile(_model, mode="reduce-overhead")
        except Exception:
            self.model = _model

        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=CONFIG["learning_rate"],
            weight_decay=CONFIG["weight_decay"],
            fused=torch.cuda.is_available(),
        )
        self.scaler = torch.amp.GradScaler('cuda')
        self.start_iter = 0
        self.best_win_rate = 0.0
        self.elo_tracker = ELOTracker()

        # Shared shift tensor for bit-unpacking (avoids repeated allocation)
        self.shifts = torch.arange(64, dtype=torch.int64,
                                   device=self.device).view(1, 1, 64)
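
        # How the unpack works, on one board (illustrative): python-chess
        # stores each piece set as a 64-bit integer, one bit per square
        # (a1 = bit 0 ... h8 = bit 63). Shifting right by k and masking with 1
        # reads square k, e.g. at the start position
        #   wp = chess.Board().pawns & chess.Board().occupied_co[chess.WHITE]
        #   (wp >> chess.E2) & 1   # -> 1, white pawn on e2
        # The tensor version broadcasts all 64 shifts over all boards at once.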

        os.makedirs(CONFIG["checkpoint_dir"], exist_ok=True)
        self.log_file = os.path.join(CONFIG["checkpoint_dir"], "training_log.csv")
        self.elo_log_file = os.path.join(CONFIG["checkpoint_dir"], "elo_log.csv")

        if not os.path.exists(self.log_file):
            with open(self.log_file, "w", newline="") as f:
                csv.writer(f).writerow([
                    # "ret_mean" (was mislabeled "v_mean"): the logged value is
                    # the mean discounted return, not a value-head statistic.
                    "iteration", "p_loss", "v_loss", "ret_mean", "fps",
                    "win_rate", "draw_rate", "check_rate", "capture_rate", "avg_game_len",
                ])
        if not os.path.exists(self.elo_log_file):
            with open(self.elo_log_file, "w", newline="") as f:
                csv.writer(f).writerow(
                    ["iteration", "elo", "eval_wins", "eval_draws", "eval_losses"])

        self._init_checkpointing()

    # ── Checkpointing ──────────────────────────────────────────────────────────
    def _init_checkpointing(self) -> None:
        latest = os.path.join(CONFIG["checkpoint_dir"], "latest.pt")
        if not os.path.exists(latest):
            return
        try:
            ckpt = torch.load(latest, map_location=self.device, weights_only=False)
            sd = ckpt['model_state_dict']
            # Handle compiled ('_orig_mod.' prefix) vs uncompiled state dicts
            loaded = False
            for attempt in [
                sd,
                {k.replace('_orig_mod.', ''): v for k, v in sd.items()},
                {'_orig_mod.' + k: v for k, v in sd.items()},
            ]:
                try:
                    self.model.load_state_dict(attempt); loaded = True; break
                except RuntimeError:
                    continue
            if not loaded:
                raise RuntimeError("All state dict key variants failed.")
            self.optimizer.load_state_dict(ckpt['optimizer_state_dict'])
            self.scaler.load_state_dict(ckpt['scaler_state_dict'])
            self.start_iter = ckpt.get('iteration', 0) + 1
            self.elo_tracker.elo = ckpt.get('elo', 1200.0)
            self.best_win_rate = ckpt.get('best_win_rate', 0.0)
            print(f"Resumed from iter {self.start_iter} | "
                  f"ELO {self.elo_tracker.elo:.0f} | best_win {self.best_win_rate:.3f}")
        except Exception as e:
            print(f"Checkpoint load failed ({e}). Starting fresh.")

    def save_checkpoint(self, iteration: int, is_best: bool = False) -> None:
        ckpt = {
            'iteration': iteration,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scaler_state_dict': self.scaler.state_dict(),
            'elo': self.elo_tracker.elo,
            'best_win_rate': self.best_win_rate,
            'config': CONFIG,
        }
        cdir = CONFIG["checkpoint_dir"]
        path = os.path.join(cdir, f"iter_{iteration:04d}.pt")
        # Atomic write: write to .tmp, then os.replace (single syscall, crash-safe)
        torch.save(ckpt, path + ".tmp"); os.replace(path + ".tmp", path)
        latest = os.path.join(cdir, "latest.pt")
        shutil.copy2(path, latest + ".tmp"); os.replace(latest + ".tmp", latest)
        if is_best:
            best = os.path.join(cdir, "best.pt")
            shutil.copy2(path, best + ".tmp"); os.replace(best + ".tmp", best)

    # ── ELO Evaluation (batched, greedy) ───────────────────────────────────────
    def _elo_game_done(self, board: chess.Board, idx: int, agent_color,
                       scores: np.ndarray, active: np.ndarray) -> None:
        if board.is_game_over():
            res = board.result()
            if (res == "1-0" and agent_color == chess.WHITE) or \
                    (res == "0-1" and agent_color == chess.BLACK):
                scores[idx] = 1.0
            elif res == "1/2-1/2":
                scores[idx] = 0.5
            else:
                scores[idx] = 0.0
            active[idx] = False

    def evaluate_elo(self, n_games: int = 32, max_ply: int = 200) -> tuple:
        """
        Play n_games vs a random opponent (agent moves batched on GPU).
        Half the games as White, half as Black.
        Returns (wins, draws, losses) from the agent's perspective.
        """
        self.model.eval()
        boards = [chess.Board() for _ in range(n_games)]
        agent_colors = [chess.WHITE if i % 2 == 0 else chess.BLACK
                        for i in range(n_games)]
        scores = np.full(n_games, 0.5, dtype=np.float32)  # default: draw
        active = np.ones(n_games, dtype=bool)
        bbs_sub = np.zeros((n_games, 12), dtype=np.uint64)  # uint64: bit 63 must fit
        meta_sub = np.zeros((n_games, 3), dtype=np.float32)

        for _ in range(max_ply):
            if not active.any(): break

            # Random moves (opponent turns) — CPU
            for i in [i for i in range(n_games)
                      if active[i] and boards[i].turn != agent_colors[i]]:
                legal = list(boards[i].legal_moves)
                if legal: boards[i].push(random.choice(legal))
                self._elo_game_done(boards[i], i, agent_colors[i], scores, active)

            # Agent moves (batched GPU)
            ag_idx = [i for i in range(n_games)
                      if active[i] and boards[i].turn == agent_colors[i]]
            if not ag_idx:
                continue

            n = len(ag_idx)
            sub = [boards[i] for i in ag_idx]
            act_sub = np.ones(n, dtype=bool)
            populate_states_fast(sub, act_sub, bbs_sub[:n], meta_sub[:n])

            # Reinterpret the uint64 bit patterns as int64 for torch; the
            # >> & 1 unpack reads single bits, so sign extension is harmless.
            bbs_t = torch.tensor(bbs_sub[:n].view(np.int64),
                                 dtype=torch.int64, device=self.device)
            unpacked = ((bbs_t.unsqueeze(-1) >> self.shifts) & 1).float().view(n, 12, 8, 8)
            state = torch.zeros(n, 14, 8, 8, device=self.device, dtype=torch.float32)
            state[:, :12] = unpacked
            state[:, 12] = torch.tensor(meta_sub[:n, 0], device=self.device).view(n, 1, 1).expand(n, 8, 8)
            state[:, 13] = torch.tensor(meta_sub[:n, 1], device=self.device).view(n, 1, 1).expand(n, 8, 8)
            for lj in range(n):
                if meta_sub[lj, 2]:
                    state[lj, 13, 0, 1] = float(meta_sub[lj, 2])

            with torch.no_grad(), torch.amp.autocast('cuda'):
                logits, _ = self.model(state.to(memory_format=torch.channels_last))
                logits = logits.float()

            masks_np, legal_lists = get_legal_masks(sub, act_sub)
            masks_t = torch.tensor(masks_np, dtype=torch.bool, device=self.device)
            logits = torch.where(masks_t, logits,
                                 torch.tensor(-60000.0, device=self.device))
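
            # Why -60000 rather than -inf: under float16 autocast the most
            # negative representable value is about -65504, so -60000 survives
            # a cast to half precision while still driving illegal moves to
            # ~0 probability in softmax; -inf could turn fully-masked rows
            # into NaNs.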
            best_acts = logits.argmax(dim=-1).cpu().numpy()  # greedy for evaluation

            for lj, gi in enumerate(ag_idx):
                if not active[gi]: continue
                move_uci = ACTION_MAPPER.idx_to_move[best_acts[lj]]
                move = chess.Move.from_uci(move_uci)
                legal = legal_lists[lj] or list(boards[gi].legal_moves)
                if not legal:
                    active[gi] = False; continue
                if move not in legal:
                    move = random.choice(legal)
                boards[gi].push(move)
                self._elo_game_done(boards[gi], gi, agent_colors[gi], scores, active)

        wins = int((scores == 1.0).sum())
        draws = int((scores == 0.5).sum())
        losses = int((scores == 0.0).sum())
        for s in scores:
            self.elo_tracker.update(float(s), RANDOM_BASELINE_ELO)
        return wins, draws, losses

    # ── Main Training Loop ─────────────────────────────────────────────────────
    def train(self, num_iterations: int) -> None:
        B = CONFIG["num_envs"]
        max_steps = CONFIG["max_steps"]
        G = CONFIG["grpo_group_size"]
        num_groups = B // G
        gamma = CONFIG["gamma"]
        t_start = time.time()
        max_rt = CONFIG["max_runtime_hours"] * 3600.0

        # ── Preallocate GPU buffers (int8/bool minimizes VRAM footprint) ───────
        states_buf = torch.zeros((max_steps, B, 14, 8, 8), dtype=torch.int8, device=self.device)
        actions_buf = torch.zeros((max_steps, B), dtype=torch.int16, device=self.device)
        logprobs_buf = torch.zeros((max_steps, B), dtype=torch.float32, device=self.device)
        values_buf = torch.zeros((max_steps, B), dtype=torch.float32, device=self.device)
        rewards_buf = torch.zeros((max_steps, B), dtype=torch.float32, device=self.device)
        dones_buf = torch.zeros((max_steps, B), dtype=torch.bool, device=self.device)
        active_buf = torch.zeros((max_steps, B), dtype=torch.bool, device=self.device)

        bbs_np = np.zeros((B, 12), dtype=np.uint64)  # uint64: full bitboard range
        meta_np = np.zeros((B, 3), dtype=np.float32)
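
        # Rough footprint of the big rollout buffer (arithmetic, not measured):
        # states_buf holds 100 * 256 * 14 * 8 * 8 int8 values ≈ 22.9 MB, versus
        # ≈ 91.8 MB if it were float32 — the int8 packing is what keeps rollout
        # storage cheap relative to the model itself.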

        vram_gb = (torch.cuda.get_device_properties(0).total_memory / 1e9
                   if torch.cuda.is_available() else 0.0)
        print(f"\n🚀 Aggressive GRPO Chess Agent")
        print(f"   Envs:{B} | Groups:{num_groups}×G:{G} | Device:{self.device.upper()} | "
              f"VRAM:{vram_gb:.1f}GB")
        print(f"   Reward: capture(0-0.3)+check(0.3)+checkmate_speed(1.0-1.5)"
              f"+draw_penalty(-0.5)+time(-0.003/step)")
        print(f"   gamma:{gamma} | entropy:{CONFIG['entropy_coef']} | "
              f"lr:{CONFIG['learning_rate']}")

        for iteration in range(self.start_iter, num_iterations):

            # ── Runtime guard ──────────────────────────────────────────────────
            elapsed = time.time() - t_start
            if elapsed > max_rt:
                print(f"\n⏱ {elapsed/3600:.2f}h reached. Saving & downloading…")
                self.save_checkpoint(iteration)
                self.plot_metrics()
                auto_download(CONFIG["checkpoint_dir"])
                break

            iter_start = time.time()

            # Zero buffers in place (no reallocation)
            states_buf.zero_(); actions_buf.zero_(); logprobs_buf.zero_()
            values_buf.zero_(); rewards_buf.zero_()
            dones_buf.fill_(False); active_buf.fill_(False)

            # ── GRPO: each group of G envs shares one opening position ─────────
            fens = [get_opening_position(CONFIG["opening_max_moves"]).fen()
                    for _ in range(num_groups)]
            envs: list[chess.Board] = []
            for gi in range(num_groups):
                for _ in range(G):
                    envs.append(chess.Board(fens[gi]))

            active = np.ones(B, dtype=bool)
            game_lengths = np.zeros(B, dtype=np.int32)

            # Per-iteration attack metrics
            white_wins = black_wins = draws_count = 0
            total_checks = total_captures = 0

            # ── PHASE 1: ROLLOUT ───────────────────────────────────────────────
            for t in range(max_steps):
                if not active.any(): break

                populate_states_fast(envs, active, bbs_np, meta_np)

                # Bit-unpack bitboards → int8 state tensor (no float copy);
                # uint64 is reinterpreted as int64, which >> & 1 reads correctly.
                bbs_t = torch.as_tensor(bbs_np.view(np.int64),
                                        dtype=torch.int64, device=self.device)
                unpacked = ((bbs_t.unsqueeze(-1) >> self.shifts) & 1).to(torch.int8)
                meta_t = torch.as_tensor(meta_np, dtype=torch.float32, device=self.device)

                # Pack into the int8 buffer (scale float meta to [-127, 127])
                states_buf[t, :, :12, :, :] = unpacked.view(B, 12, 8, 8)
                states_buf[t, :, 12, :, :] = (meta_t[:, 0] * 127).clamp(-127, 127) \
                    .to(torch.int8).view(B, 1, 1).expand(B, 8, 8)
                states_buf[t, :, 13, :, :] = (meta_t[:, 1] * 127).clamp(0, 127) \
                    .to(torch.int8).view(B, 1, 1).expand(B, 8, 8)
                states_buf[t, :, 13, 0, 1] = (meta_t[:, 2] * 127).clamp(0, 127).to(torch.int8)
                active_buf[t] = torch.as_tensor(active, dtype=torch.bool, device=self.device)

                # Normalize int8 → float32 for the forward pass
                model_input = states_buf[t].to(
                    dtype=torch.float32, memory_format=torch.channels_last) / 127.0

                self.model.eval()
                with torch.no_grad(), torch.amp.autocast('cuda'):
                    logits, values = self.model(model_input)

                masks_np, legal_moves_list = get_legal_masks(envs, active)
                masks_t = torch.as_tensor(masks_np, dtype=torch.bool, device=self.device)
                logits = logits.float()
                logits = torch.where(masks_t, logits,
                                     torch.tensor(-60000.0, device=self.device))
                # Rows with no legal move (finished envs) get flat logits so
                # Categorical stays valid; their sampled actions are ignored.
                no_legal = ~masks_t.any(dim=-1, keepdim=True)
                logits.masked_fill_(no_legal, 0.0)

                probs = F.softmax(logits, dim=-1)
                dist = torch.distributions.Categorical(probs)
                actions = dist.sample()

                actions_buf[t] = actions.to(torch.int16)
                logprobs_buf[t] = dist.log_prob(actions)
                values_buf[t] = values.squeeze(-1)

                actions_cpu = actions.cpu().numpy()

                for b in range(B):
                    if not active[b]: continue

                    move_uci = ACTION_MAPPER.idx_to_move[actions_cpu[b]]
                    move = chess.Move.from_uci(move_uci)
                    if move not in legal_moves_list[b]:
                        move = random.choice(legal_moves_list[b])

                    board = envs[b]
                    mover_is_white = (board.turn == chess.WHITE)
                    sign = 1.0 if mover_is_white else -1.0

                    # ── Reward: pre-push components ────────────────────────────
                    r = -0.003 * sign  # time penalty (per mover, white-perspective)

                    if board.is_capture(move):
                        if board.is_en_passant(move):
                            cap_val = 1.0
                        else:
                            cp = board.piece_at(move.to_square)
                            cap_val = PIECE_VAL.get(cp.piece_type, 0.0) if cp else 0.0
                        r += sign * (cap_val / 9.0) * 0.3  # [0, 0.3]
                        total_captures += 1
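
                    # Worked example of the capture term: taking a queen is
                    # worth sign * (9.0 / 9.0) * 0.3 = ±0.3, a pawn only
                    # sign * (1.0 / 9.0) * 0.3 ≈ ±0.033; en passant is scored
                    # as a pawn capture (cap_val = 1.0).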

                    if move.promotion in (chess.QUEEN, chess.ROOK):
                        r += sign * 0.15  # aggressive promotion

                    board.push(move)
                    game_lengths[b] += 1

                    # ── Reward: post-push components ───────────────────────────
                    if board.is_check():
                        r += sign * 0.3  # gave check
                        total_checks += 1

                    if board.is_game_over():
                        if board.is_checkmate():
                            # Mover delivered checkmate
                            speed_bonus = 0.5 * math.exp(-game_lengths[b] / 20.0)
                            r += sign * (1.0 + speed_bonus)  # ~1.0-1.5
                            if mover_is_white: white_wins += 1
                            else: black_wins += 1
                        else:
                            # Draw (stalemate / 50-move / repetition / insufficient material)
                            r -= 0.5  # flat penalty from White's perspective — attack to WIN
                            draws_count += 1
                        dones_buf[t, b] = True
                        active[b] = False

                    rewards_buf[t, b] = r
                # end per-env loop
            # end rollout

            # ── PHASE 2: VECTORIZED RETURNS ────────────────────────────────────
            returns = torch.zeros(B, dtype=torch.float32, device=self.device)
            returns_buf = torch.zeros((max_steps, B), dtype=torch.float32, device=self.device)
            not_done_f = (~dones_buf).float()
            for step in reversed(range(max_steps)):
                returns = rewards_buf[step] + gamma * returns * not_done_f[step]
                returns_buf[step] = returns
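
            # The backward sweep computes, per env, G_t = r_t + gamma * G_{t+1},
            # zeroing the bootstrap across episode boundaries via not_done.
            # E.g. with gamma = 0.98 and rewards [0.3, 0.0, 1.5] ending in mate:
            #   G_2 = 1.5
            #   G_1 = 0.0 + 0.98 * 1.5  = 1.47
            #   G_0 = 0.3 + 0.98 * 1.47 ≈ 1.74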

            # ── PHASE 3: GRPO GROUP-WISE ADVANTAGE NORMALIZATION ───────────────
            # advantages shape [max_steps, B]
            adv_raw = returns_buf - values_buf
            active_f = active_buf.float()

            # Reshape to [max_steps, num_groups, G] and normalize within each group
            adv_3d = adv_raw.view(max_steps, num_groups, G)
            act_3d = active_f.view(max_steps, num_groups, G)

            g_count = act_3d.sum(dim=[0, 2]).clamp(min=1.0)        # [num_groups]
            g_mean = (adv_3d * act_3d).sum(dim=[0, 2]) / g_count   # [num_groups]
            g_sq_diff = ((adv_3d - g_mean.view(1, num_groups, 1)) ** 2
                         * act_3d).sum(dim=[0, 2])
            g_std = (g_sq_diff / g_count).sqrt().clamp(min=1e-8)   # [num_groups]
            adv_3d = (adv_3d - g_mean.view(1, num_groups, 1)) / \
                     g_std.view(1, num_groups, 1)
            adv_norm = adv_3d.view(max_steps, B)
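
            # Group-relative standardization in the GRPO spirit: each advantage
            # is rescaled against its own group's rollout statistics,
            #   A_hat = (A - mean_g) / std_g,
            # so the G envs sharing an opening are compared with each other
            # rather than against one global baseline. (This hybrid still keeps
            # the value head as a per-state baseline inside A = G_t - V(s_t).)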

            # Flatten, filter to active steps only
            valid_mask = active_buf.view(-1)
            flat_states = (states_buf.view(-1, 14, 8, 8)[valid_mask]
                           .to(torch.float32, memory_format=torch.channels_last)
                           .div_(127.0))
            flat_actions = actions_buf.view(-1)[valid_mask].to(torch.int64)
            flat_old_lp = logprobs_buf.view(-1)[valid_mask]
            flat_returns = returns_buf.view(-1)[valid_mask]
            flat_advantages = adv_norm.view(-1)[valid_mask]

            dataset_size = flat_states.size(0)
            if dataset_size < 100:
                continue  # skip degenerate rollout (all games ended instantly)

            # ── PHASE 4: PPO OPTIMIZATION ──────────────────────────────────────
            self.model.train()
            total_p_loss = total_v_loss = 0.0
            num_updates = 0
            mb_size = CONFIG["mini_batch_size"]

            for _ in range(CONFIG["ppo_epochs"]):
                perm = torch.randperm(dataset_size, device=self.device)
                for start in range(0, dataset_size, mb_size):
                    mb = perm[start: start + mb_size]
                    with torch.amp.autocast('cuda'):
                        new_logits, new_vals = self.model(flat_states[mb])
                        new_dist = torch.distributions.Categorical(logits=new_logits)
                        new_lp = new_dist.log_prob(flat_actions[mb])
                        ratio = torch.exp(new_lp - flat_old_lp[mb])
                        adv = flat_advantages[mb]
                        surr1 = ratio * adv
                        surr2 = torch.clamp(
                            ratio,
                            1.0 - CONFIG["clip_epsilon"],
                            1.0 + CONFIG["clip_epsilon"],
                        ) * adv
                        p_loss = -torch.min(surr1, surr2).mean()
                        v_loss = F.mse_loss(new_vals.squeeze(-1), flat_returns[mb])
                        entropy = new_dist.entropy().mean()
                        loss = (p_loss
                                + CONFIG["value_coef"] * v_loss
                                - CONFIG["entropy_coef"] * entropy)

                    self.optimizer.zero_grad(set_to_none=True)
                    self.scaler.scale(loss).backward()
                    self.scaler.unscale_(self.optimizer)
                    nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                    self.scaler.step(self.optimizer)
                    self.scaler.update()

                    total_p_loss += p_loss.item()
                    total_v_loss += v_loss.item()
                    num_updates += 1
621
+ # ── PHASE 5: METRICS & LOGGING ────────────────────────────────
622
+ done_count = white_wins + black_wins + draws_count
623
+ win_rate = white_wins / max(done_count, 1)
624
+ draw_rate = draws_count / max(done_count, 1)
625
+ active_steps = int(active_buf.sum().item())
626
+ check_rate = total_checks / max(active_steps, 1)
627
+ capture_rate = total_captures / max(active_steps, 1)
628
+ avg_game_len = float(game_lengths.mean())
629
+ fps = dataset_size / max(time.time() - iter_start, 1e-3)
630
+
631
+ if (iteration + 1) % CONFIG["log_interval"] == 0:
632
+ vram_alloc = (torch.cuda.memory_allocated() / 1e9
633
+ if torch.cuda.is_available() else 0.0)
634
+ vram_res = (torch.cuda.memory_reserved() / 1e9
635
+ if torch.cuda.is_available() else 0.0)
636
+ print(
637
+ f"[{iteration+1:05d}] "
638
+ f"P:{total_p_loss/max(1,num_updates):.4f} "
639
+ f"V:{total_v_loss/max(1,num_updates):.4f} | "
640
+ f"W:{win_rate:.3f} D:{draw_rate:.3f} "
641
+ f"Chk:{check_rate:.4f} Cap:{capture_rate:.4f} "
642
+ f"Len:{avg_game_len:.1f} | "
643
+ f"ELO:{self.elo_tracker.elo:.0f} | "
644
+ f"FPS:{fps:.0f} | "
645
+ f"VRAM:{vram_alloc:.2f}/{vram_res:.2f}GB"
646
+ )
647
+ with open(self.log_file, "a", newline="") as f:
648
+ csv.writer(f).writerow([
649
+ iteration + 1,
650
+ total_p_loss / max(1, num_updates),
651
+ total_v_loss / max(1, num_updates),
652
+ flat_returns.mean().item(),
653
+ fps, win_rate, draw_rate,
654
+ check_rate, capture_rate, avg_game_len,
655
+ ])
656
+
657
+ # Save best checkpoint when win_rate improves
658
+ if win_rate > self.best_win_rate:
659
+ self.best_win_rate = win_rate
660
+ self.save_checkpoint(iteration + 1, is_best=True)
661
+
662
+ if (iteration + 1) % CONFIG["save_interval"] == 0:
663
+ self.save_checkpoint(iteration + 1)
664
+ self.plot_metrics()
665
+
666
+ # ELO evaluation
667
+ if (iteration + 1) % CONFIG["elo_eval_interval"] == 0:
668
+ elo_before = self.elo_tracker.elo
669
+ ew, ed, el = self.evaluate_elo(CONFIG["elo_eval_games"])
670
+ print(
671
+ f" [ELO eval] {elo_before:.0f} β†’ {self.elo_tracker.elo:.0f} | "
672
+ f"W:{ew} D:{ed} L:{el} vs random({RANDOM_BASELINE_ELO})"
673
+ )
674
+ with open(self.elo_log_file, "a", newline="") as f:
675
+ csv.writer(f).writerow(
676
+ [iteration + 1, self.elo_tracker.elo, ew, ed, el])
677
+ self.plot_metrics()
678
+
679
+ # Aggressive cache reclaim (free fragmented blocks, not pinned allocs)
680
+ torch.cuda.empty_cache()
681
+
682
+ # ── Plotting ───────────────────────────────────────────────────────────────
683
+ def plot_metrics(self) -> None:
684
+ if not os.path.exists(self.log_file): return
685
+ df = pd.read_csv(self.log_file)
686
+ if len(df) < 2: return
687
+
688
+ elo_df = None
689
+ if os.path.exists(self.elo_log_file):
690
+ elo_df = pd.read_csv(self.elo_log_file)
691
+
692
+ fig, axs = plt.subplots(3, 2, figsize=(14, 12))
693
+ fig.suptitle("Aggressive GRPO Chess Agent β€” Training Dashboard", fontsize=14)
694
+
695
+ # Row 0: Losses
696
+ axs[0, 0].plot(df['iteration'], df['p_loss'], color='steelblue', linewidth=1.2)
697
+ axs[0, 0].set_title('Policy Loss'); axs[0, 0].set_xlabel('Iteration')
698
+
699
+ axs[0, 1].plot(df['iteration'], df['v_loss'], color='tomato', linewidth=1.2)
700
+ axs[0, 1].set_title('Value Loss'); axs[0, 1].set_xlabel('Iteration')
701
+
702
+ # Row 1: Outcomes
703
+ axs[1, 0].plot(df['iteration'], df['win_rate'], label='Win', color='green')
704
+ axs[1, 0].plot(df['iteration'], df['draw_rate'], label='Draw', color='orange')
705
+ axs[1, 0].set_title('Outcomes (White perspective)')
706
+ axs[1, 0].legend(); axs[1, 0].set_xlabel('Iteration')
707
+
708
+ # Row 1: Attack metrics
709
+ axs[1, 1].plot(df['iteration'], df['check_rate'], label='Check/step', color='purple')
710
+ axs[1, 1].plot(df['iteration'], df['capture_rate'], label='Capture/step', color='darkorange')
711
+ axs[1, 1].set_title('Attack Metrics (↑ = more aggressive)')
712
+ axs[1, 1].legend(); axs[1, 1].set_xlabel('Iteration')
713
+
714
+ # Row 2: ELO Rating
715
+ if elo_df is not None and len(elo_df) > 0:
716
+ axs[2, 0].plot(elo_df['iteration'], elo_df['elo'],
717
+ color='gold', linewidth=2.0, label='Agent ELO')
718
+ axs[2, 0].axhline(RANDOM_BASELINE_ELO, linestyle='--',
719
+ color='gray', alpha=0.8, label=f'Random ({RANDOM_BASELINE_ELO})')
720
+ axs[2, 0].axhline(1200, linestyle=':', color='lightblue',
721
+ alpha=0.6, label='Start (1200)')
722
+ axs[2, 0].fill_between(elo_df['iteration'], RANDOM_BASELINE_ELO,
723
+ elo_df['elo'], alpha=0.15, color='gold')
724
+ axs[2, 0].set_title('ELO Rating vs Random Baseline')
725
+ axs[2, 0].legend(); axs[2, 0].set_xlabel('Iteration')
726
+ else:
727
+ axs[2, 0].text(0.5, 0.5, f'ELO eval every {CONFIG["elo_eval_interval"]} iters',
728
+ ha='center', va='center', transform=axs[2, 0].transAxes,
729
+ color='gray', fontsize=11)
730
+ axs[2, 0].set_title('ELO Rating (pending)')
731
+
732
+ # Row 2: Average game length
733
+ axs[2, 1].plot(df['iteration'], df['avg_game_len'], color='teal', linewidth=1.2)
734
+ axs[2, 1].set_title('Avg Game Length (↓ = faster checkmates)')
735
+ axs[2, 1].set_xlabel('Iteration')
736
+
737
+ for ax in axs.flat:
738
+ ax.grid(True, alpha=0.25)
739
+
740
+ plt.tight_layout()
741
+ out = os.path.join(CONFIG["checkpoint_dir"], "training_performance.png")
742
+ plt.savefig(out, dpi=100, bbox_inches='tight')
743
+ plt.close(fig)
744
+ print(f" [Plot] saved β†’ {out}")
745
+
746
+

# ── Entry Point ────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Aggressive GRPO Chess Agent (T4/Colab)")
    parser.add_argument("--iterations", type=int, default=10000,
                        help="Total training iterations")
    parser.add_argument("--test-batch", action="store_true",
                        help="Run 2 iterations as a smoke test")
    args, _ = parser.parse_known_args()

    torch.manual_seed(CONFIG["seed"])
    np.random.seed(CONFIG["seed"])
    random.seed(CONFIG["seed"])

    # Print a VRAM summary at startup
    if torch.cuda.is_available():
        props = torch.cuda.get_device_properties(0)
        print(f"GPU: {props.name} | VRAM: {props.total_memory/1e9:.1f}GB | "
              f"SM: {props.multi_processor_count} | "
              f"Compute: {props.major}.{props.minor}")

    trainer = GRPOTrainer()
    trainer.train(2 if args.test_batch else args.iterations)
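
# Usage (illustrative):
#   python model_aggressive.py --test-batch        # 2-iteration smoke test
#   python model_aggressive.py --iterations 5000   # longer run
# parse_known_args() is used instead of parse_args() so stray notebook argv
# entries (e.g. the Jupyter kernel's "-f <connection-file>") don't crash argparse.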