Commit ·
78131a0
0
Parent(s):
OpenGrid: Multi-agent POMDP power grid environment with GRPO training
Browse filesFeatures:
- Multi-agent POMDP environment with safety layer and oversight agent
- Environment-grounded GRPO reward function (steps actual physics)
- FastAPI server with single/multi-agent APIs, grading, and visualization
- Heuristic baseline, LLM inference pipeline, and training notebook
- Karnataka KPTCL real-world grid task
- 4 task difficulties: easy, medium, hard, karnataka
- .dockerignore +31 -0
- .gitattributes +5 -0
- .gitignore +36 -0
- Dockerfile +32 -0
- LICENSE +21 -0
- README.md +390 -0
- app.py +416 -0
- changes.md +111 -0
- inference.py +535 -0
- openenv.yaml +40 -0
- pyproject.toml +41 -0
- requirements.txt +9 -0
- server/__init__.py +1 -0
- server/app.py +21 -0
- src/__init__.py +0 -0
- src/baseline.py +199 -0
- src/environment.py +672 -0
- src/grader.py +232 -0
- src/models.py +162 -0
- src/oversight.py +190 -0
- src/physics.py +172 -0
- src/safety.py +316 -0
- src/tasks.py +384 -0
- src/visualization.py +224 -0
- static/app.js +680 -0
- static/index.html +225 -0
- static/karnataka.svg +3 -0
- static/logo.png +3 -0
- static/style.css +935 -0
- tests/__init__.py +0 -0
- tests/test_multi_agent.py +345 -0
- tests/test_solver.py +195 -0
- training/__init__.py +1 -0
- training/opengrid_grpo_colab.ipynb +635 -0
- training/train_grpo.py +827 -0
- validate-submission.sh +103 -0
.dockerignore
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
.pytest_cache/
|
| 4 |
+
.venv/
|
| 5 |
+
venv/
|
| 6 |
+
.git/
|
| 7 |
+
.gitignore
|
| 8 |
+
.vscode/
|
| 9 |
+
.env
|
| 10 |
+
|
| 11 |
+
# Docs (keep README for the Space)
|
| 12 |
+
guide.md
|
| 13 |
+
detailed judging criteria.md
|
| 14 |
+
ui_skill.md
|
| 15 |
+
project-spec.md
|
| 16 |
+
codebase_summary.md
|
| 17 |
+
pyrightconfig.json
|
| 18 |
+
|
| 19 |
+
# Generated files
|
| 20 |
+
inference_output.txt
|
| 21 |
+
generate_code_md.py
|
| 22 |
+
uv.lock
|
| 23 |
+
|
| 24 |
+
# Training outputs (not needed in Docker image)
|
| 25 |
+
training/outputs/
|
| 26 |
+
*.safetensors
|
| 27 |
+
*.bin
|
| 28 |
+
|
| 29 |
+
# Tests not needed in production
|
| 30 |
+
tests/
|
| 31 |
+
test_multiagent.py
|
.gitattributes
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.gif filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ico filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
.pytest_cache/
|
| 5 |
+
.venv/
|
| 6 |
+
venv/
|
| 7 |
+
*.egg-info/
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
| 10 |
+
.env
|
| 11 |
+
.vscode/
|
| 12 |
+
|
| 13 |
+
# Generated / temporary files
|
| 14 |
+
inference_output.txt
|
| 15 |
+
codebase_summary.md
|
| 16 |
+
generate_code_md.py
|
| 17 |
+
uv.lock
|
| 18 |
+
|
| 19 |
+
# Reference docs (not part of submission)
|
| 20 |
+
guide.md
|
| 21 |
+
detailed judging criteria.md
|
| 22 |
+
ui_skill.md
|
| 23 |
+
project-spec.md
|
| 24 |
+
pyrightconfig.json
|
| 25 |
+
|
| 26 |
+
# Training outputs (large files — push separately or add to HF)
|
| 27 |
+
training/outputs/
|
| 28 |
+
*.safetensors
|
| 29 |
+
*.bin
|
| 30 |
+
|
| 31 |
+
# OS files
|
| 32 |
+
Thumbs.db
|
| 33 |
+
.DS_Store
|
| 34 |
+
|
| 35 |
+
# Duplicate test file (tests/ directory has the real one)
|
| 36 |
+
test_multiagent.py
|
Dockerfile
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hugging Face Docker Space — OpenGrid
|
| 2 |
+
# Docs: https://huggingface.co/docs/hub/spaces-sdks-docker
|
| 3 |
+
|
| 4 |
+
FROM python:3.10-slim
|
| 5 |
+
|
| 6 |
+
LABEL org.opencontainers.image.title="OpenGrid"
|
| 7 |
+
LABEL org.opencontainers.image.description="Renewable energy grid load-balancing environment"
|
| 8 |
+
LABEL openenv="true"
|
| 9 |
+
|
| 10 |
+
# Create non-root user required by HF Spaces
|
| 11 |
+
RUN useradd -m -u 1000 user
|
| 12 |
+
USER user
|
| 13 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
| 14 |
+
|
| 15 |
+
WORKDIR /app
|
| 16 |
+
|
| 17 |
+
# Install dependencies
|
| 18 |
+
COPY --chown=user requirements.txt .
|
| 19 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
| 20 |
+
|
| 21 |
+
# Copy application code
|
| 22 |
+
COPY --chown=user . /app
|
| 23 |
+
|
| 24 |
+
# Expose HF Spaces default port
|
| 25 |
+
EXPOSE 7860
|
| 26 |
+
|
| 27 |
+
# Healthcheck
|
| 28 |
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=15s \
|
| 29 |
+
CMD python -c "import httpx; httpx.get('http://localhost:7860/health').raise_for_status()" || exit 1
|
| 30 |
+
|
| 31 |
+
# Run the server
|
| 32 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 KRISHNA GOYAL
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: OpenGrid
|
| 3 |
+
emoji: ⚡
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_file: app.py
|
| 8 |
+
pinned: false
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
<p align="center">
|
| 12 |
+
<img src="static/logo.png" alt="OpenGrid Logo" width="120">
|
| 13 |
+
</p>
|
| 14 |
+
|
| 15 |
+
<h1 align="center">OpenGrid ⚡</h1>
|
| 16 |
+
<p align="center"><strong>Safe Multi-Agent RL for Power Grid Operations</strong></p>
|
| 17 |
+
|
| 18 |
+
<p align="center">
|
| 19 |
+
<a href="https://huggingface.co/spaces/K446/Opengrid"><img src="https://img.shields.io/badge/🤗%20Live%20Demo-HuggingFace%20Space-yellow" alt="Live Demo"></a>
|
| 20 |
+
<a href="https://github.com/krishnagoyal099/Opengrid_env"><img src="https://img.shields.io/badge/GitHub-Repository-181717?logo=github" alt="GitHub"></a>
|
| 21 |
+
<a href="https://github.com/openenv"><img src="https://img.shields.io/badge/OpenEnv-compatible-blue" alt="OpenEnv"></a>
|
| 22 |
+
<a href="https://www.python.org"><img src="https://img.shields.io/badge/python-3.10%2B-blue" alt="Python 3.10+"></a>
|
| 23 |
+
<a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-green.svg" alt="License: MIT"></a>
|
| 24 |
+
</p>
|
| 25 |
+
|
| 26 |
+
---
|
| 27 |
+
|
| 28 |
+
## What is OpenGrid?
|
| 29 |
+
|
| 30 |
+
OpenGrid is a **multi-agent reinforcement learning environment** where AI agents control a power grid. Multiple agents, each managing a zone, must coordinate under **partial observability** to keep the lights on — balancing electricity supply and demand in real-time while managing renewable energy volatility.
|
| 31 |
+
|
| 32 |
+
What makes OpenGrid different:
|
| 33 |
+
|
| 34 |
+
- **Multi-Agent POMDP**: 2-3 agents, each seeing only their local zone + noisy global signals
|
| 35 |
+
- **Safety Layer**: Hard constraint filter blocks unsafe actions before they reach the physics engine (N-1 security, anti-islanding, ramp limits)
|
| 36 |
+
- **Oversight Agent**: Monitors cross-zone coordination, penalizes selfish behavior
|
| 37 |
+
- **Composable Rewards**: 6 independent reward functions — survival, frequency, congestion, safety compliance, coordination, efficiency
|
| 38 |
+
- **Real Physics**: DC power flow solver with droop frequency model
|
| 39 |
+
|
| 40 |
+
> **🔗 Try it live:** [huggingface.co/spaces/K446/Opengrid](https://huggingface.co/spaces/K446/Opengrid)
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
## How It Works
|
| 45 |
+
|
| 46 |
+
```
|
| 47 |
+
┌─────────────────────────────────────────────────────────┐
|
| 48 |
+
│ MULTI-AGENT LOOP │
|
| 49 |
+
│ │
|
| 50 |
+
│ Each agent observes LOCAL zone state (POMDP) │
|
| 51 |
+
│ │ │
|
| 52 |
+
│ ▼ │
|
| 53 |
+
│ Each agent proposes action (adjust power, switch │
|
| 54 |
+
│ lines — only within their zone) │
|
| 55 |
+
│ │ │
|
| 56 |
+
│ ▼ │
|
| 57 |
+
│ SAFETY LAYER validates all actions: │
|
| 58 |
+
│ - N-1 security check │
|
| 59 |
+
│ - Anti-islanding │
|
| 60 |
+
│ - Projects unsafe → nearest safe alternative │
|
| 61 |
+
│ │ │
|
| 62 |
+
│ ▼ │
|
| 63 |
+
│ OVERSIGHT AGENT evaluates coordination: │
|
| 64 |
+
│ - Detects conflicts between agents │
|
| 65 |
+
│ - Penalizes selfish behavior │
|
| 66 |
+
│ │ │
|
| 67 |
+
│ ▼ │
|
| 68 |
+
│ Physics engine solves DC power flow │
|
| 69 |
+
│ │ │
|
| 70 |
+
│ ▼ │
|
| 71 |
+
│ Per-agent rewards: local + global + safety + coord │
|
| 72 |
+
│ │ │
|
| 73 |
+
│ Repeat for 50 steps — or until blackout! │
|
| 74 |
+
└─────────────────────────────────────────────────────────┘
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
The agent interacts through a **REST API** — any language or framework that can make HTTP requests can play. Both single-agent (backward compatible) and multi-agent modes are supported.
|
| 78 |
+
|
| 79 |
+
---
|
| 80 |
+
|
| 81 |
+
## Three Difficulty Levels
|
| 82 |
+
|
| 83 |
+
| Task | Grid Size | Agents | Renewable Mix | What Makes It Hard |
|
| 84 |
+
|---|---|---|---|---|
|
| 85 |
+
| `task_easy` | 5 buses | 2 | 20% | Basic frequency control, 2-zone coordination |
|
| 86 |
+
| `task_medium` | 10 buses | 3 | 50% | Volatile renewables + congestion + 3-zone POMDP |
|
| 87 |
+
| `task_hard` | 14 buses | 3 | 70% | High volatility, tight margins, complex topology |
|
| 88 |
+
| `task_karnataka` | 15 buses | 4 | Real mix | Real KPTCL topology (Raichur, Ballari, Bengaluru, Mysuru) with GPS coordinates |
|
| 89 |
+
|
| 90 |
+
All tasks run for **50 timesteps**. Scores range from **0.02 to 0.98** (higher = better).
|
| 91 |
+
|
| 92 |
+
---
|
| 93 |
+
|
| 94 |
+
## Quick Start
|
| 95 |
+
|
| 96 |
+
### 1. Clone & Install
|
| 97 |
+
|
| 98 |
+
```bash
|
| 99 |
+
git clone https://github.com/krishnagoyal099/Opengrid_env.git
|
| 100 |
+
cd Opengrid_env
|
| 101 |
+
|
| 102 |
+
pip install -r requirements.txt
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
### 2. Start the Server
|
| 106 |
+
|
| 107 |
+
```bash
|
| 108 |
+
uvicorn app:app --host 0.0.0.0 --port 7860
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
Then open [http://localhost:7860](http://localhost:7860) — you'll see the **interactive SCADA dashboard** with a Leaflet.js GIS map showing the Karnataka grid topology in real-time.
|
| 112 |
+
|
| 113 |
+
### 3. Run the AI Agent
|
| 114 |
+
|
| 115 |
+
```bash
|
| 116 |
+
# Set your LLM API credentials
|
| 117 |
+
export API_BASE_URL="https://api.openai.com/v1"
|
| 118 |
+
export MODEL_NAME="gpt-4o"
|
| 119 |
+
export HF_TOKEN="your-api-key"
|
| 120 |
+
export ENV_URL="http://localhost:7860"
|
| 121 |
+
|
| 122 |
+
# Run inference on all 3 tasks
|
| 123 |
+
python inference.py
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
### 4. Train with GRPO
|
| 127 |
+
|
| 128 |
+
```bash
|
| 129 |
+
# Test the training pipeline (no GPU needed)
|
| 130 |
+
python training/train_grpo.py --test-mode
|
| 131 |
+
|
| 132 |
+
# Full training with Unsloth (needs GPU)
|
| 133 |
+
python training/train_grpo.py --model unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit --use-unsloth
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
### Docker (Alternative)
|
| 137 |
+
|
| 138 |
+
```bash
|
| 139 |
+
docker build -t opengrid .
|
| 140 |
+
docker run -p 7860:7860 opengrid
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
---
|
| 144 |
+
|
| 145 |
+
## Multi-Agent API
|
| 146 |
+
|
| 147 |
+
### Reset in Multi-Agent Mode
|
| 148 |
+
|
| 149 |
+
```bash
|
| 150 |
+
curl -X POST "http://localhost:7860/reset_multi?task_id=task_medium"
|
| 151 |
+
# Returns: {
|
| 152 |
+
# "session_id": "abc-123",
|
| 153 |
+
# "num_agents": 3,
|
| 154 |
+
# "zone_info": {"0": {"zone_name": "Bengaluru_Region", "bus_ids": [...]}, ...},
|
| 155 |
+
# "observations": {"0": {...}, "1": {...}, "2": {...}}
|
| 156 |
+
# }
|
| 157 |
+
```
|
| 158 |
+
|
| 159 |
+
### Take a Multi-Agent Step
|
| 160 |
+
|
| 161 |
+
```bash
|
| 162 |
+
curl -X POST "http://localhost:7860/step_multi?session_id=abc-123" \
|
| 163 |
+
-H "Content-Type: application/json" \
|
| 164 |
+
-d '{
|
| 165 |
+
"agent_actions": {
|
| 166 |
+
"0": {"bus_adjustments": [{"bus_id": 0, "delta": 5.0}], "topology_actions": []},
|
| 167 |
+
"1": {"bus_adjustments": [], "topology_actions": []},
|
| 168 |
+
"2": {"bus_adjustments": [{"bus_id": 9, "delta": -3.0}], "topology_actions": []}
|
| 169 |
+
}
|
| 170 |
+
}'
|
| 171 |
+
# Returns: per-agent observations, per-agent rewards, safety reports, oversight report
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
### Single-Agent API (Backward Compatible)
|
| 175 |
+
|
| 176 |
+
The original single-agent API (`/reset`, `/step`, `/state`, `/grader`) is fully preserved.
|
| 177 |
+
|
| 178 |
+
---
|
| 179 |
+
|
| 180 |
+
## What Each Agent Sees (POMDP Observation)
|
| 181 |
+
|
| 182 |
+
Each agent receives a **partial** observation of their zone:
|
| 183 |
+
|
| 184 |
+
| Field | Example | Meaning |
|
| 185 |
+
|---|---|---|
|
| 186 |
+
| `grid_frequency` | `49.87` | **Noisy** frequency reading (Gaussian noise added) |
|
| 187 |
+
| `local_buses[].type` | `"solar"` | Bus type (only buses in agent's zone) |
|
| 188 |
+
| `local_buses[].p_injection` | `35.2` | Power output in MW |
|
| 189 |
+
| `boundary_lines[].rho` | `0.78` | Lines connecting to other zones |
|
| 190 |
+
| `internal_lines[].flow` | `62.4` | Lines within agent's zone |
|
| 191 |
+
| `neighbor_signals` | `{1: 12.5}` | Average injection of neighboring zones |
|
| 192 |
+
| `zone_load_mw` | `85.3` | Total load in this zone |
|
| 193 |
+
| `zone_gen_mw` | `42.1` | Total generation in this zone |
|
| 194 |
+
|
| 195 |
+
Agents do **NOT** see buses or lines in other zones — they must coordinate through limited neighbor signals and the shared (but noisy) frequency reading.
|
| 196 |
+
|
| 197 |
+
---
|
| 198 |
+
|
| 199 |
+
## Safety Layer
|
| 200 |
+
|
| 201 |
+
The safety layer validates every action BEFORE it reaches the physics engine:
|
| 202 |
+
|
| 203 |
+
| Check | What It Does | If Violated |
|
| 204 |
+
|---|---|---|
|
| 205 |
+
| **Zone Boundary** | Agent can only adjust buses in their zone | Action removed |
|
| 206 |
+
| **N-1 Security** | Grid must survive loss of any single line | Action blocked |
|
| 207 |
+
| **Anti-Islanding** | Opening a line must not disconnect the grid | Switch blocked |
|
| 208 |
+
| **Ramp Limits** | Power changes within physical ramp rates | Delta clamped |
|
| 209 |
+
| **Capacity Limits** | Generation within min/max bounds | Output clamped |
|
| 210 |
+
| **Battery SoC** | Can't discharge below 0 or charge above capacity | Delta clamped |
|
| 211 |
+
|
| 212 |
+
Critically, unsafe actions are **projected to the nearest safe alternative** rather than simply rejected. This preserves the agent's intent while enforcing safety, and provides a richer training signal.
|
| 213 |
+
|
| 214 |
+
---
|
| 215 |
+
|
| 216 |
+
## Reward System
|
| 217 |
+
|
| 218 |
+
Six composable, independent reward functions:
|
| 219 |
+
|
| 220 |
+
| Component | Range | When |
|
| 221 |
+
|---|---|---|
|
| 222 |
+
| **survival** | +1.0 / -100.0 | Grid stays connected / blackout |
|
| 223 |
+
| **frequency** | -1.5 to +0.2 | Based on deviation from 50 Hz |
|
| 224 |
+
| **local_congestion** | ≤ 0 | Line overloads in agent's zone |
|
| 225 |
+
| **safety_compliance** | -0.3 to +0.1 | Penalty if safety layer corrected action |
|
| 226 |
+
| **coordination** | ≤ 0 | Penalty for selfish/conflicting actions |
|
| 227 |
+
| **action_cost** | -0.5 / switch | Topology change cost |
|
| 228 |
+
|
| 229 |
+
---
|
| 230 |
+
|
| 231 |
+
## Scoring
|
| 232 |
+
|
| 233 |
+
Scores are normalized to **(0.02 – 0.98)** using:
|
| 234 |
+
|
| 235 |
+
```
|
| 236 |
+
score = (agent_reward - worst_case) / (best_case - worst_case) + N1_bonus
|
| 237 |
+
```
|
| 238 |
+
|
| 239 |
+
| Bound | How It's Computed |
|
| 240 |
+
|---|---|
|
| 241 |
+
| **Worst case (floor)** | Random agent that chaotically switches lines — causes blackouts fast |
|
| 242 |
+
| **Best case (ceiling)** | Theoretical perfect agent: survives every step + perfect frequency bonus |
|
| 243 |
+
| **N-1 bonus** | Up to +10% for completing the episode without a blackout |
|
| 244 |
+
|
| 245 |
+
### Baseline Scores (Heuristic Policy)
|
| 246 |
+
|
| 247 |
+
| Task | Score | Strategy |
|
| 248 |
+
|---|---|---|
|
| 249 |
+
| `task_easy` | ~0.90 | Proportional frequency control, no line switching |
|
| 250 |
+
| `task_medium` | ~0.98 | Same heuristic — medium grid happens to be well-balanced |
|
| 251 |
+
| `task_hard` | ~0.98 | Same heuristic — hard grid has more buses but similar dynamics |
|
| 252 |
+
| `task_karnataka` | ~0.98 | 15-bus real topology, 4 zones, generators warm-started |
|
| 253 |
+
|
| 254 |
+
> Reproduce with: `python get_scores.py`
|
| 255 |
+
|
| 256 |
+
---
|
| 257 |
+
|
| 258 |
+
## Project Structure
|
| 259 |
+
|
| 260 |
+
```
|
| 261 |
+
OpenGrid/
|
| 262 |
+
├── app.py # FastAPI server (single + multi-agent endpoints)
|
| 263 |
+
├── inference.py # LLM inference script
|
| 264 |
+
├── get_scores.py # Reproduce baseline scores
|
| 265 |
+
├── openenv.yaml # OpenEnv manifest
|
| 266 |
+
├── Dockerfile # Container config
|
| 267 |
+
├── requirements.txt # Python dependencies
|
| 268 |
+
│
|
| 269 |
+
├── src/ # Core environment
|
| 270 |
+
│ ├── models.py # Pydantic models (single + multi-agent)
|
| 271 |
+
│ ├── environment.py # Grid simulation (POMDP + backward-compatible)
|
| 272 |
+
│ ├── physics.py # DC power flow solver
|
| 273 |
+
│ ├── tasks.py # Procedural grid generation with zone assignment
|
| 274 |
+
│ ├── grader.py # Scoring (floor/ceiling normalization)
|
| 275 |
+
│ ├── baseline.py # Heuristic + LLM policies
|
| 276 |
+
│ ├── safety.py # Safety layer (N-1, anti-islanding, projection)
|
| 277 |
+
│ ├── oversight.py # Oversight agent (coordination monitoring)
|
| 278 |
+
│ └── visualization.py # Grid topology & frequency plots
|
| 279 |
+
│
|
| 280 |
+
├── training/ # RL training pipeline
|
| 281 |
+
│ ├── train_grpo.py # TRL GRPO training script
|
| 282 |
+
│ └── opengrid_grpo_colab.ipynb # Google Colab notebook for GPU training
|
| 283 |
+
│
|
| 284 |
+
├── tests/ # Test suite (28 tests)
|
| 285 |
+
│ ├── test_solver.py # Physics, environment, grader tests
|
| 286 |
+
│ └── test_multi_agent.py # Multi-agent, safety, oversight tests
|
| 287 |
+
│
|
| 288 |
+
├── static/ # Dashboard frontend
|
| 289 |
+
│ ├── index.html
|
| 290 |
+
│ ├── style.css
|
| 291 |
+
│ └── app.js
|
| 292 |
+
│
|
| 293 |
+
└── server/ # Alternative entry point
|
| 294 |
+
└── app.py
|
| 295 |
+
```
|
| 296 |
+
|
| 297 |
+
---
|
| 298 |
+
|
| 299 |
+
## Training Results (GRPO)
|
| 300 |
+
|
| 301 |
+
We trained **Qwen 2.5 1.5B** using GRPO (Group Relative Policy Optimization) on the Karnataka grid topology.
|
| 302 |
+
|
| 303 |
+
### Training Loss
|
| 304 |
+
|
| 305 |
+
The loss converges from ~0.09 to near 0 by step ~400, confirming end-to-end training pipeline functionality.
|
| 306 |
+
|
| 307 |
+
### Before vs After (Average Episode Reward)
|
| 308 |
+
|
| 309 |
+
| Task | Heuristic Baseline | GRPO Trained |
|
| 310 |
+
|---|---|---|
|
| 311 |
+
| `task_easy` | 27.6 | 27.6 |
|
| 312 |
+
| `task_medium` | 48.7 | 48.7 |
|
| 313 |
+
| `task_karnataka` | 19.6 | -316.9 |
|
| 314 |
+
|
| 315 |
+
**Key Finding**: Naive LLM training on simplified proxy rewards does not transfer to real-world grid topologies — Karnataka collapses to -316.9. This validates our architectural decision to pair RL agents with a **safety layer + oversight agent**. The heuristic baseline with safety corrections (19.6 reward, zero blackouts) outperforms pure RL, proving that critical infrastructure needs guardrails, not just learned policies.
|
| 316 |
+
|
| 317 |
+
> **Reproduce training**: Open `training/opengrid_grpo_colab.ipynb` in Google Colab (T4 GPU)
|
| 318 |
+
|
| 319 |
+
---
|
| 320 |
+
|
| 321 |
+
## Technical Details
|
| 322 |
+
|
| 323 |
+
<details>
|
| 324 |
+
<summary><strong>Physics Engine</strong></summary>
|
| 325 |
+
|
| 326 |
+
- **DC Power Flow** with B-matrix formulation (standard power systems approximation)
|
| 327 |
+
- **Slack bus** absorbs generation/load imbalance after each power flow solve
|
| 328 |
+
- **Islanding detection** via NetworkX graph connectivity checks
|
| 329 |
+
- **Droop frequency model** calibrated to system size: `f = 50.0 - (2.5 / total_capacity) * P_slack`
|
| 330 |
+
|
| 331 |
+
</details>
|
| 332 |
+
|
| 333 |
+
<details>
|
| 334 |
+
<summary><strong>Multi-Agent Design</strong></summary>
|
| 335 |
+
|
| 336 |
+
- Buses partitioned into zones using **greedy modularity community detection** (NetworkX)
|
| 337 |
+
- Each zone maps to a KPTCL transmission region (Bengaluru, Mysuru, Kalaburagi)
|
| 338 |
+
- **Partial observability**: agents see only local buses, boundary lines, noisy frequency
|
| 339 |
+
- **Neighbor signals**: each agent receives average injection of adjacent zones
|
| 340 |
+
- **Safety-first**: all actions validated by constraint filter before physics engine
|
| 341 |
+
|
| 342 |
+
</details>
|
| 343 |
+
|
| 344 |
+
<details>
|
| 345 |
+
<summary><strong>Thread Safety</strong></summary>
|
| 346 |
+
|
| 347 |
+
- All session reads/writes are protected by a `threading.Lock`
|
| 348 |
+
- Grader bounds use double-checked locking to avoid duplicate rollouts
|
| 349 |
+
- Safe for concurrent requests from multiple agents
|
| 350 |
+
|
| 351 |
+
</details>
|
| 352 |
+
|
| 353 |
+
<details>
|
| 354 |
+
<summary><strong>Reproducibility</strong></summary>
|
| 355 |
+
|
| 356 |
+
| Component | Mechanism |
|
| 357 |
+
|---|---|
|
| 358 |
+
| Task grids | Seeded procedural generation (`np.random.default_rng`) |
|
| 359 |
+
| Zone partitioning | Deterministic community detection with seed |
|
| 360 |
+
| Wind variability | Per-episode RNG (same seed → same wind pattern) |
|
| 361 |
+
| Floor estimation | Seeded thrash policy + 10 diverse-seeded episodes |
|
| 362 |
+
| Ceiling | Analytical formula (deterministic) |
|
| 363 |
+
| Scoring | Shared `normalize_score()` across all endpoints |
|
| 364 |
+
|
| 365 |
+
</details>
|
| 366 |
+
|
| 367 |
+
---
|
| 368 |
+
|
| 369 |
+
## Related Work
|
| 370 |
+
|
| 371 |
+
- **Massgen**: When Multiple LLMs Think Together (Gradient Network, 2025)
|
| 372 |
+
- **Symphony**: Multi-Agent Intelligence in a Collective Fabric (Gradient Network, 2025)
|
| 373 |
+
- **Grid2Op**: Power grid RL environment (RTE, 2020)
|
| 374 |
+
- **OpenEnv**: Standardized agentic execution environments (Scalar/HuggingFace/Meta, 2026)
|
| 375 |
+
|
| 376 |
+
---
|
| 377 |
+
|
| 378 |
+
## Links
|
| 379 |
+
|
| 380 |
+
| Resource | URL |
|
| 381 |
+
|---|---|
|
| 382 |
+
| **Live Demo** | [huggingface.co/spaces/K446/Opengrid](https://huggingface.co/spaces/K446/Opengrid) |
|
| 383 |
+
| **GitHub Repo** | [github.com/krishnagoyal099/Opengrid_env](https://github.com/krishnagoyal099/Opengrid_env) |
|
| 384 |
+
| **API Docs (Swagger)** | [huggingface.co/spaces/K446/Opengrid/docs](https://k446-opengrid.hf.space/docs) |
|
| 385 |
+
|
| 386 |
+
---
|
| 387 |
+
|
| 388 |
+
## License
|
| 389 |
+
|
| 390 |
+
MIT — see [LICENSE](LICENSE) for details.
|
app.py
ADDED
|
@@ -0,0 +1,416 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException
|
| 2 |
+
from fastapi.staticfiles import StaticFiles
|
| 3 |
+
from fastapi.responses import FileResponse
|
| 4 |
+
from typing import Dict, List
|
| 5 |
+
from src.models import (
|
| 6 |
+
GridAction, GridObservation, GridReward,
|
| 7 |
+
MultiAgentAction, MultiAgentStepResult,
|
| 8 |
+
)
|
| 9 |
+
from src.environment import OpenGridEnv
|
| 10 |
+
from src.tasks import TASKS
|
| 11 |
+
from src.grader import RobustnessGrader, normalize_score, _SCORE_EPSILON, _clamp_score
|
| 12 |
+
from src.baseline import heuristic_policy, llm_policy
|
| 13 |
+
from src.visualization import generate_dashboard
|
| 14 |
+
import copy
|
| 15 |
+
import uuid
|
| 16 |
+
import os
|
| 17 |
+
import time
|
| 18 |
+
import pathlib
|
| 19 |
+
import threading
|
| 20 |
+
import warnings
|
| 21 |
+
|
| 22 |
+
app = FastAPI(
|
| 23 |
+
title="OpenGrid Environment",
|
| 24 |
+
description="Multi-agent renewable energy grid load-balancing environment with safety constraints",
|
| 25 |
+
version="2.0.0"
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
# Static files — mount only if present (allows API-only or test deployments)
|
| 29 |
+
STATIC_DIR = pathlib.Path(__file__).parent / "static"
|
| 30 |
+
if STATIC_DIR.exists():
|
| 31 |
+
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
|
| 32 |
+
else:
|
| 33 |
+
warnings.warn(
|
| 34 |
+
f"Static directory not found: {STATIC_DIR}. "
|
| 35 |
+
"Dashboard UI disabled; API endpoints remain available."
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
# Session storage with TTL + per-session locking
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
# _session_lock guards the sessions/history *dicts* for insert/delete/lookup.
|
| 42 |
+
# Each session also has its own lock ("lock" key) that serializes env
|
| 43 |
+
# operations, preventing race conditions when concurrent requests target
|
| 44 |
+
# the same session (e.g. two /step calls, or /step racing with /grader).
|
| 45 |
+
# ---------------------------------------------------------------------------
|
| 46 |
+
sessions: Dict[str, Dict] = {}
|
| 47 |
+
history: Dict[str, List] = {}
|
| 48 |
+
MAX_SESSIONS = 100
|
| 49 |
+
SESSION_TTL_SECONDS = 3600 # 1 hour
|
| 50 |
+
_session_lock = threading.Lock()
|
| 51 |
+
|
| 52 |
+
# Grader cache: bounds are expensive (10 rollouts per task), compute once.
|
| 53 |
+
# Construction AND bounds estimation are serialized under _grader_lock.
|
| 54 |
+
_grader_cache: Dict[str, RobustnessGrader] = {}
|
| 55 |
+
_grader_lock = threading.Lock()
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def _new_session(env: OpenGridEnv, task_id: str, mode: str, **extra) -> dict:
    """Build the bookkeeping dict for a freshly created environment session.

    The dict carries the env, creation/last-access timestamps (used by the
    TTL eviction logic), reward accumulators, terminal flags, and a
    per-session lock that serializes env operations. Any extra keyword
    arguments are merged on top of the defaults.
    """
    now = time.time()
    return {
        "env": env,
        "created": now,
        "last_access": now,
        "task_id": task_id,
        "rewards": [],
        "mode": mode,
        "done": False,
        "is_blackout": False,
        "lock": threading.Lock(),
        **extra,
    }
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _session_age(s: dict, now: float) -> float:
|
| 76 |
+
"""Return the last-access timestamp for a session (for eviction sorting)."""
|
| 77 |
+
ts = s.get("last_access")
|
| 78 |
+
if ts is None:
|
| 79 |
+
ts = s.get("created")
|
| 80 |
+
return float(ts) if ts is not None else now
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _cleanup_sessions():
    """Evict expired and excess sessions. Caller must hold _session_lock."""
    now = time.time()

    # Drop every session whose last activity is older than the TTL.
    stale = [
        sid for sid, sess in sessions.items()
        if now - _session_age(sess, now) > SESSION_TTL_SECONDS
    ]
    for sid in stale:
        sessions.pop(sid, None)
        history.pop(sid, None)

    # Still at/over capacity? Drop least-recently-used sessions one by one.
    while len(sessions) >= MAX_SESSIONS:
        lru_sid = min(sessions, key=lambda sid: _session_age(sessions[sid], 0.0))
        sessions.pop(lru_sid, None)
        history.pop(lru_sid, None)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def _get_session(session_id: str) -> dict:
    """Fetch a session by id, refreshing its last-access timestamp.

    Raises HTTPException(404) when the id is unknown. Caller must NOT
    already hold _session_lock — this function acquires it.
    """
    with _session_lock:
        session = sessions.get(session_id)
        if session is None:
            raise HTTPException(404, "Session not found")
        session["last_access"] = time.time()
    return session
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def _get_grader(task_id: str) -> RobustnessGrader:
    """Return the cached RobustnessGrader for *task_id*, building it on demand.

    Construction and the expensive bounds estimation both happen while
    holding _grader_lock, so concurrent /grader requests can neither
    duplicate the rollouts nor race on _estimate_bounds() mutations.
    """
    with _grader_lock:
        grader = _grader_cache.get(task_id)
        if grader is None:
            grader = RobustnessGrader(copy.deepcopy(TASKS[task_id]))
            grader.get_bounds()  # pay the expensive bounds cost while locked
            _grader_cache[task_id] = grader
        return grader
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@app.get("/")
def root():
    """Serve the dashboard page, or a JSON API banner when no static UI ships."""
    index_page = STATIC_DIR / "index.html"
    if not index_page.exists():
        return {"status": "OpenGrid API", "version": "2.0.0", "docs": "/docs"}
    return FileResponse(str(index_page))
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
@app.get("/health")
def health():
    """Liveness probe — always returns a small JSON status payload."""
    return {
        "status": "OpenGrid Running",
        "version": "2.0.0",
        "docs": "/docs",
    }
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
@app.get("/tasks")
def get_tasks():
    """List available tasks with metadata including multi-agent zone info.

    Each entry carries grid size, episode length, agent/zone layout, and
    the JSON schemas for actions and observations so clients can validate
    payloads up front.
    """
    action_schema = GridAction.model_json_schema()
    obs_schema = GridObservation.model_json_schema()
    return [
        {
            "id": k,
            # split('_', 1)[-1] never raises, unlike split('_')[1] which
            # crashed with IndexError for ids lacking an underscore; the
            # result is identical for all current "task_*" ids.
            "difficulty": v.get("difficulty", k.split('_', 1)[-1]),
            "num_buses": v["num_buses"],
            "max_steps": v["max_steps"],
            "num_agents": v.get("num_agents", 1),
            "zone_names": v.get("zone_names", []),
            "buses": v.get("buses", []),
            "action_schema": action_schema,
            "observation_schema": obs_schema
        } for k, v in TASKS.items()
    ]
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ===========================================================================
|
| 168 |
+
# Single-Agent API (backward compatible)
|
| 169 |
+
# ===========================================================================
|
| 170 |
+
|
| 171 |
+
@app.post("/reset")
def reset(task_id: str = "task_easy"):
    """Create a fresh single-agent session for *task_id*; return its id and first observation."""
    if task_id not in TASKS:
        raise HTTPException(404, f"Task '{task_id}' not found. Available: {list(TASKS.keys())}")

    env = OpenGridEnv(copy.deepcopy(TASKS[task_id]))
    first_obs = env.reset()
    session_id = str(uuid.uuid4())

    with _session_lock:
        _cleanup_sessions()  # make room before registering the new session
        sessions[session_id] = _new_session(env, task_id, mode="single")
        history[session_id] = [first_obs]

    return {"session_id": session_id, "observation": first_obs.model_dump()}
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
@app.post("/step")
def step(session_id: str, action: GridAction):
    """Execute one step in the environment.

    Raises 404 for an unknown session and 400 when the episode has
    already terminated. Returns the new observation, reward breakdown,
    done flag, and step info.
    """
    session = _get_session(session_id)

    # Per-session lock serializes all env operations for this session
    with session["lock"]:
        if session.get("done"):
            raise HTTPException(400, "Episode already done. Call /reset to start a new session.")

        env = session["env"]
        obs, reward, done, info = env.step(action)

        session["rewards"].append(reward.value)
        session["done"] = done
        session["is_blackout"] = info.is_blackout

        with _session_lock:
            # Fix: the session (and its history entry) can be evicted by
            # _cleanup_sessions() between _get_session() and this append;
            # the unguarded history[session_id] raised KeyError in that race.
            if session_id in history:
                history[session_id].append(obs)

        return {
            "observation": obs.model_dump(),
            "reward": reward.model_dump(),
            "done": done,
            "info": info.model_dump()
        }
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
@app.get("/state")
def get_state(session_id: str):
    """Return the full current grid state of a session as JSON."""
    session = _get_session(session_id)

    with session["lock"]:
        state = session["env"].state()
    return state.model_dump()
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
# ===========================================================================
|
| 227 |
+
# Multi-Agent POMDP API
|
| 228 |
+
# ===========================================================================
|
| 229 |
+
|
| 230 |
+
@app.post("/reset_multi")
def reset_multi(task_id: str = "task_easy"):
    """Start a multi-agent episode; return per-agent partial observations and zone layout."""
    if task_id not in TASKS:
        raise HTTPException(404, f"Task '{task_id}' not found. Available: {list(TASKS.keys())}")

    env = OpenGridEnv(copy.deepcopy(TASKS[task_id]))
    zone_obs = env.reset_multi()
    session_id = str(uuid.uuid4())
    zone_info = env.get_zone_info()

    with _session_lock:
        _cleanup_sessions()
        sessions[session_id] = _new_session(
            env, task_id, mode="multi",
            per_agent_rewards={agent: [] for agent in range(env.num_agents)},
        )
        # History keeps the *full-grid* view for visualization, not the
        # per-agent partial observations.
        history[session_id] = [env.state()]

    return {
        "session_id": session_id,
        "num_agents": env.num_agents,
        "zone_info": {str(aid): info.model_dump() for aid, info in zone_info.items()},
        "observations": {str(aid): ob.model_dump() for aid, ob in zone_obs.items()},
    }
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
@app.post("/step_multi")
def step_multi(session_id: str, actions: MultiAgentAction):
    """Multi-agent step with safety layer and oversight.

    Each agent submits actions for their zone. The safety layer validates,
    the oversight agent evaluates coordination, and per-agent rewards are
    computed. Raises 404 for unknown sessions, 400 for finished episodes,
    wrong-mode sessions, or malformed agent ids.
    """
    session = _get_session(session_id)

    with session["lock"]:
        if session.get("done"):
            raise HTTPException(400, "Episode already done. Call /reset_multi to start a new session.")

        env = session["env"]
        if session.get("mode") != "multi":
            raise HTTPException(400, "Session not in multi-agent mode. Use /reset_multi first.")

        # JSON object keys arrive as strings; normalize to ints and validate range.
        agent_actions = {}
        for key, agent_action in actions.agent_actions.items():
            try:
                agent_id = int(key) if isinstance(key, str) else key
            except (TypeError, ValueError) as exc:
                # Chain the original conversion error for debuggability.
                raise HTTPException(400, f"Invalid agent_id: {key!r}") from exc
            if not (0 <= agent_id < env.num_agents):
                raise HTTPException(
                    400,
                    f"Invalid agent_id {agent_id}; expected 0..{env.num_agents - 1}",
                )
            agent_actions[agent_id] = agent_action

        result = env.step_multi(agent_actions)

        session["rewards"].append(result.team_reward)
        session["done"] = result.done
        session["is_blackout"] = result.info.is_blackout
        for agent_id, reward in result.rewards.items():
            if agent_id in session.get("per_agent_rewards", {}):
                session["per_agent_rewards"][agent_id].append(reward.value)

        # Store full-grid observation for visualization
        with _session_lock:
            # Fix: the session's history entry can be evicted concurrently
            # by _cleanup_sessions(); the unguarded history[session_id]
            # raised KeyError in that race.
            if session_id in history:
                history[session_id].append(env.state())

        return {
            "observations": {str(k): v.model_dump() for k, v in result.observations.items()},
            "rewards": {str(k): v.model_dump() for k, v in result.rewards.items()},
            "team_reward": result.team_reward,
            "done": result.done,
            "safety_reports": {str(k): v.model_dump() for k, v in result.safety_reports.items()},
            "oversight_report": result.oversight_report.model_dump(),
            "info": result.info.model_dump(),
        }
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
@app.get("/zones")
def get_zones(session_id: str):
    """Return zone assignments and the agent count for a multi-agent session."""
    session = _get_session(session_id)
    env = session["env"]

    with session["lock"]:
        zone_info = env.get_zone_info()

    return {
        "num_agents": env.num_agents,
        "zones": {str(aid): info.model_dump() for aid, info in zone_info.items()},
    }
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
# ===========================================================================
|
| 329 |
+
# Grading & Baseline
|
| 330 |
+
# ===========================================================================
|
| 331 |
+
|
| 332 |
+
@app.get("/grader")
def run_grader(session_id: str):
    """
    Grade a completed (or in-progress) session.

    The score lies strictly inside (0, 1) and uses the same normalization
    as /baseline: an analytical reward ceiling plus an empirical floor.
    """
    session = _get_session(session_id)

    # Snapshot the mutable fields while holding the per-session lock.
    with session["lock"]:
        rewards = list(session["rewards"])
        task_id = session["task_id"]
        is_blackout = session.get("is_blackout", False)

    if not rewards:
        return {"score": _SCORE_EPSILON, "message": "No steps taken yet. Run /step first."}

    cumulative = sum(rewards)
    bounds = _get_grader(task_id).get_bounds()

    raw = normalize_score(
        cumulative_reward=cumulative,
        reward_floor=bounds["reward_floor"],
        reward_ceiling=bounds["reward_ceiling"],
        n1_survival_rate=0.0 if is_blackout else 1.0,
    )
    # Defense-in-depth: clamp again at the API boundary
    score = _clamp_score(raw)

    return {
        "score": score,
        "cumulative_reward": round(cumulative, 4),
        "steps": len(rewards),
        "is_blackout": is_blackout,
        "task_id": task_id,
        "reward_floor": bounds["reward_floor"],
        "reward_ceiling": bounds["reward_ceiling"]
    }
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
@app.get("/baseline")
def run_baseline(use_llm: bool = False):
    """
    Run the baseline policy on all registered tasks. Returns 0.0–1.0 scores.

    Default: heuristic (reproducible). Set use_llm=true for the LLM agent,
    which requires HF_TOKEN or OPENAI_API_KEY to be set.

    Uses the same cached grader as /grader — bounds are computed once
    and reused across all endpoints.
    """
    api_key = os.getenv("HF_TOKEN", os.getenv("OPENAI_API_KEY", ""))
    if use_llm and not api_key:
        raise HTTPException(
            400,
            "use_llm=true requires HF_TOKEN or OPENAI_API_KEY environment variable",
        )

    policy = llm_policy if use_llm and api_key else heuristic_policy
    policy_name = "llm" if policy is llm_policy else "heuristic"

    results = {}
    # Iterate task ids only — the previous `.items()` loop bound the task
    # config to an unused variable on every iteration.
    for task_id in TASKS:
        grader = _get_grader(task_id)  # cached — no duplicate rollouts
        results[task_id] = grader.evaluate_policy(policy, n_episodes=3)

    return {"policy": policy_name, "baseline_scores": results}
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
@app.get("/visualize")
def visualize(session_id: str):
    """Render the current grid state plus frequency history as a base64 image."""
    session = _get_session(session_id)

    with session["lock"]:
        current_obs = session["env"].state()
    with _session_lock:
        past_states = list(history.get(session_id, []))

    return {"image_base64": generate_dashboard(past_states, current_obs)}
|
changes.md
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Notebook Changes — opengrid_grpo_colab.ipynb
|
| 2 |
+
|
| 3 |
+
## Bug fixes applied (2026-04-25)
|
| 4 |
+
|
| 5 |
+
### Cell 7 — Generate Training Prompts
|
| 6 |
+
|
| 7 |
+
| # | Severity | Bug | Fix |
|
| 8 |
+
|---|----------|-----|-----|
|
| 9 |
+
| 1 | 🔴 Critical | `obs_dict = obs.model_dump()` produces dicts with integer keys; `Dataset.from_dict({"obs_context": obs_contexts})` fails with `ArrowTypeError: Expected dict key of type str or bytes, got 'int'` | Changed to `json.loads(obs.model_dump_json())` so all keys are strings; then stored as `json.dumps(obs_dict)` — a flat JSON string PyArrow handles trivially |
|
| 10 |
+
| 2 | 🟡 Bug | `env = OpenGridEnv(task_config)` instantiated before the loop but immediately replaced inside the loop — wasted object creation | Removed stray instantiation |
|
| 11 |
+
| 3 | 🟡 Bug | `import copy`, `import json` inside inner loop body — re-imported on every iteration | Moved to top of cell |
|
| 12 |
+
| 4 | 🟡 Bug | Slack bus included in random action choices — physics solver overwrites it, wasting action budget | Filtered to `['generator', 'battery']` only |
|
| 13 |
+
|
| 14 |
+
### Cell 8 — Reward Function
|
| 15 |
+
|
| 16 |
+
| # | Severity | Bug | Fix |
|
| 17 |
+
|---|----------|-----|-----|
|
| 18 |
+
| 5 | 🔴 Critical | `reward_fn` received `obs_context` as JSON strings from the dataset column but passed them directly to `compute_grpo_reward` which expects dicts | Added `json.loads(ctx) if isinstance(ctx, str) else ctx` deserialization before scoring |
|
| 19 |
+
| 6 | 🟡 Bug | No assertion to catch silent arity mismatches | Added `assert len(test_rewards) == 2` sanity check |
|
| 20 |
+
|
| 21 |
+
### Cell 9 — Training
|
| 22 |
+
|
| 23 |
+
| # | Severity | Bug | Fix |
|
| 24 |
+
|---|----------|-----|-----|
|
| 25 |
+
| 7 | 🟡 Bug | `bf16=torch.cuda.is_bf16_supported()` raises `AssertionError` when CUDA is not available (no GPU runtime) | Guarded: `_cuda_ok = torch.cuda.is_available()` then `_bf16 = _cuda_ok and ...` |
|
| 26 |
+
|
| 27 |
+
### Cell 12 — Before/After Plot
|
| 28 |
+
|
| 29 |
+
| # | Severity | Bug | Fix |
|
| 30 |
+
|---|----------|-----|-----|
|
| 31 |
+
| 8 | 🟡 Bug | Bar labels used `va='bottom'` for all bars; for negative-height bars the label renders inside/below the bar | Fixed: `va='bottom'` when `h >= 0`, `va='top'` when `h < 0`, with matching y-offset |
|
| 32 |
+
|
| 33 |
+
### Cell 13 — Summary Table
|
| 34 |
+
|
| 35 |
+
| # | Severity | Bug | Fix |
|
| 36 |
+
|---|----------|-----|-----|
|
| 37 |
+
| 9 | 🟡 Bug | `common_tasks` was set in Cell 12; if the user skips the plot cell, Cell 13 raises `NameError: common_tasks` | Rebuilt `common_tasks` defensively at the top of Cell 13 |
|
| 38 |
+
|
| 39 |
+
---
|
| 40 |
+
|
| 41 |
+
## `inference.py` — Code review fixes (2026-04-25)
|
| 42 |
+
|
| 43 |
+
### High-priority fixes
|
| 44 |
+
|
| 45 |
+
| # | Severity | Issue | Fix |
|
| 46 |
+
|---|----------|-------|-----|
|
| 47 |
+
| 1 | 🔴 Bug | `parse_action()` crashes on valid JSON that is not an object (e.g. `[]`) — `AttributeError` not caught by `except (json.JSONDecodeError, KeyError)` | Rewrote with `isinstance(data, dict)` guard, list-unwrapping, field-type validation, and broad `except Exception` |
|
| 48 |
+
| 2 | 🔴 Bug | `parse_action()` markdown/prose stripping is fragile — fails on `Here is the action: {...}` | Extracts first `{...}` substring via `text.find("{")` / `text.rfind("}")` |
|
| 49 |
+
| 3 | 🔴 Reliability | `/grader` call can exceed `httpx` 30s timeout on first use (lazy `RobustnessGrader` bound estimation) | `grade()` now uses `timeout=180.0`; base client uses `httpx.Timeout(connect=10, read=60, write=30, pool=10)` |
|
| 50 |
+
| 4 | 🟡 Bug | `HF_TOKEN` takes precedence over `OPENAI_API_KEY` — if both set with OpenAI endpoint, auth fails | Changed to `API_KEY or OPENAI_API_KEY or HF_TOKEN` priority order |
|
| 51 |
+
| 5 | 🟡 Bug | No JSON-mode enforcement for LLM — models return markdown/prose | Added `response_format={"type": "json_object"}` with fallback for unsupported endpoints |
|
| 52 |
+
|
| 53 |
+
### System prompt fixes
|
| 54 |
+
|
| 55 |
+
| # | Severity | Issue | Fix |
|
| 56 |
+
|---|----------|-------|-----|
|
| 57 |
+
| 6 | 🟡 Design | Prompt says slack bus is controllable, but physics solver overwrites it | Changed to: "avoid adjusting the slack bus — physics overwrites it" |
|
| 58 |
+
| 7 | 🟡 Design | Single-agent mode allows topology actions without safety layer protection | Added: "Prefer NO topology actions unless absolutely necessary" |
|
| 59 |
+
| 8 | 🟡 Design | Multi-agent prompt says "Only for lines in your zone" but observations include boundary lines | Clarified: "Only for visible internal or boundary lines. Boundary-line switching is risky" |
|
| 60 |
+
|
| 61 |
+
### Multi-agent robustness fixes
|
| 62 |
+
|
| 63 |
+
| # | Severity | Issue | Fix |
|
| 64 |
+
|---|----------|-------|-----|
|
| 65 |
+
| 9 | 🟡 Bug | Agent iteration uses `range(num_agents)` — assumes contiguous integer IDs | Changed to `sorted(observations.keys())` |
|
| 66 |
+
| 10 | 🟡 Bug | `safety_reports` assumed to be list, but API returns dict keyed by agent ID | Added `isinstance` check to handle both list and dict formats |
|
| 67 |
+
| 11 | 🟡 Design | Safety correction feedback not fed back to LLM — model repeats same invalid actions | Appended `[SAFETY] {reason}` to agent history when corrections occur |
|
| 68 |
+
|
| 69 |
+
### Other fixes
|
| 70 |
+
|
| 71 |
+
| # | Severity | Issue | Fix |
|
| 72 |
+
|---|----------|-------|-----|
|
| 73 |
+
| 12 | 🟡 Bug | `MAX_STEPS = 50` hardcoded — may truncate future tasks | Changed to `MAX_STEPS = 100` as safety cap; `done` flag is the true terminator |
|
| 74 |
+
| 13 | 🟡 Bug | Default task list excludes `task_karnataka` despite KPTCL multi-agent framing | Added `task_karnataka` to `TASKS` list |
|
| 75 |
+
| 14 | 🟡 Bug | Module docstring says all 3 env vars are required; only API key is | Fixed docstring to document defaults and actual requirements |
|
| 76 |
+
| 15 | 🟡 Bug | `[END]` log prints score at `.2f` but summary prints `.4f` — precision loss | Changed `log_end` to use `:.4f` |
|
| 77 |
+
| 16 | 🟡 Reliability | `OpenAI()` client has no timeout or retry config | Added `timeout=30.0, max_retries=2` |
|
| 78 |
+
| 17 | 🟢 Feature | No `list_tasks()` method on `EnvClient` | Added `list_tasks()` for future task validation |
|
| 79 |
+
|
| 80 |
+
---
|
| 81 |
+
|
| 82 |
+
## GRPO Training — Environment-Grounded Rewards (2026-04-25)
|
| 83 |
+
|
| 84 |
+
### Root Cause: Proxy Reward Disconnect
|
| 85 |
+
|
| 86 |
+
The original `compute_grpo_reward` was a **heuristic proxy scorer** that evaluated JSON format, direction, and proportionality without ever stepping the environment. The model optimized this proxy, which did not correlate with actual grid physics rewards. Result: zero improvement over baseline.
|
| 87 |
+
|
| 88 |
+
### Changes Made
|
| 89 |
+
|
| 90 |
+
#### `src/environment.py`
|
| 91 |
+
|
| 92 |
+
| # | Change | Purpose |
|
| 93 |
+
|---|--------|---------|
|
| 94 |
+
| 1 | Added `_set_state(obs_dict)` method to `OpenGridEnv` | Enables restoring environment to any observed state for reward computation. Rebuilds bus/line state, frequency, and slack injection from observation dicts. |
|
| 95 |
+
|
| 96 |
+
#### `training/train_grpo.py`
|
| 97 |
+
|
| 98 |
+
| # | Severity | Change | Details |
|
| 99 |
+
|---|----------|--------|---------|
|
| 100 |
+
| 2 | 🔴 Critical | Replaced `compute_grpo_reward` with `compute_grpo_reward_env` | New reward function **actually steps the physics simulation**: restores env state → steps with LLM action → measures real reward → runs mini-rollout with heuristic continuation for trajectory awareness |
|
| 101 |
+
| 3 | 🔴 Critical | Added mini-rollout scoring (horizon=3) | After the LLM's action, runs 2 more steps with heuristic policy to capture trajectory-level impact. Combines: `immediate_reward + 0.5 * rollout_reward` |
|
| 102 |
+
| 4 | 🟡 Medium | Increased `num_generations` from 4 → 8 | Wider GRPO group = more reward variance = stronger ranking signal. Prevents the advantage calculation from collapsing to zero. |
|
| 103 |
+
| 5 | 🟡 Medium | Increased random perturbation range from ±15 → ±30 MW | Creates more diverse/stressed grid states during training data generation. Model sees near-blackout and overload scenarios. |
|
| 104 |
+
| 6 | 🟡 Medium | Added adversarial battery drain (every 5th episode) | Forces model to learn actions when batteries are near-empty — a critical edge case the original data lacked. |
|
| 105 |
+
| 7 | 🟡 Medium | Multi-bus perturbations (1-2 buses per step) | Was single-bus. More diverse action patterns create richer state transitions. |
|
| 106 |
+
| 8 | 🟡 Medium | Increased learning rate from 5e-6 → 1e-5 | Slightly more aggressive to capitalize on the now-meaningful reward signal. |
|
| 107 |
+
| 9 | 🟡 Medium | Increased gradient accumulation (effective batch 16) | Smoother gradients for more stable training. |
|
| 108 |
+
| 10 | 🟡 Medium | Steps per episode increased from 10 → 15 | More temporal diversity in observations. |
|
| 109 |
+
| 11 | 🟢 Minor | obs_context stored as JSON string | Fixes Arrow serialization (PyArrow can't handle dicts with int keys). |
|
| 110 |
+
| 12 | 🟢 Minor | Kept legacy `compute_grpo_reward` for test-mode compat | Backward compatibility with `--test-mode` pipeline verification. |
|
| 111 |
+
|
inference.py
ADDED
|
@@ -0,0 +1,535 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenGrid Inference Script
|
| 3 |
+
=========================
|
| 4 |
+
Runs an LLM agent against all OpenGrid tasks via the OpenAI-compatible API.
|
| 5 |
+
Supports both single-agent and multi-agent POMDP modes.
|
| 6 |
+
|
| 7 |
+
Optional environment variables:
|
| 8 |
+
API_BASE_URL -- defaults to https://api.openai.com/v1
|
| 9 |
+
MODEL_NAME -- defaults to gpt-4o
|
| 10 |
+
Required (one of):
|
| 11 |
+
OPENAI_API_KEY or HF_TOKEN
|
| 12 |
+
|
| 13 |
+
Emits structured [START], [STEP], [END] logs to stdout.
|
| 14 |
+
|
| 15 |
+
Usage:
|
| 16 |
+
# Single-agent mode (backward compatible)
|
| 17 |
+
python inference.py
|
| 18 |
+
|
| 19 |
+
# Multi-agent mode (uses safety layer + oversight)
|
| 20 |
+
python inference.py --multi
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
import os
|
| 24 |
+
import sys
|
| 25 |
+
import json
|
| 26 |
+
import math
|
| 27 |
+
import argparse
|
| 28 |
+
import httpx
|
| 29 |
+
|
| 30 |
+
from openai import OpenAI
|
| 31 |
+
|
| 32 |
+
# ---------- Configuration ----------

# OpenAI-compatible endpoint and model; both overridable via environment.
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o")

# Credential resolution order: explicit API_KEY first, then OPENAI_API_KEY
# (preferred when using the OpenAI endpoint), then HF_TOKEN as fallback.
API_KEY = (
    os.environ.get("API_KEY")
    or os.environ.get("OPENAI_API_KEY")
    or os.environ.get("HF_TOKEN")
    or ""
)

# Base URL of the OpenGrid environment server (FastAPI app).
ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
BENCHMARK = "OpenGrid"
# Safety cap — the environment's 'done' flag is the true terminator
MAX_STEPS = 100
# Episodes scoring at or above this are reported as successful.
SUCCESS_SCORE_THRESHOLD = 0.5

# Task ids to evaluate, in order; must match the server's registered tasks.
TASKS = ["task_easy", "task_medium", "task_hard", "task_karnataka"]
|
| 52 |
+
|
| 53 |
+
SYSTEM_PROMPT_SINGLE = """You are a Power Grid Controller AI. Your goal is to maintain grid stability.
|
| 54 |
+
|
| 55 |
+
Key objectives:
|
| 56 |
+
1. Keep grid frequency close to 50.0 Hz (acceptable: 49.5-50.5 Hz)
|
| 57 |
+
2. Prevent transmission line overloads (rho < 1.0)
|
| 58 |
+
3. Avoid grid islanding (blackout)
|
| 59 |
+
|
| 60 |
+
Available actions:
|
| 61 |
+
1. bus_adjustments: List of {"bus_id": int, "delta": float}
|
| 62 |
+
- Positive delta = increase power injection (discharge battery / ramp up generator)
|
| 63 |
+
- Negative delta = decrease power injection (charge battery / ramp down generator)
|
| 64 |
+
- Only works on battery and generator buses (avoid adjusting the slack bus — physics overwrites it)
|
| 65 |
+
2. topology_actions: List of {"line_id": str, "action": "open" | "close"}
|
| 66 |
+
- Opening a line removes it; closing reconnects. 3-step cooldown.
|
| 67 |
+
- WARNING: Opening lines can cause islanding -> blackout
|
| 68 |
+
- Prefer NO topology actions unless absolutely necessary. Always return "topology_actions": []
|
| 69 |
+
|
| 70 |
+
Strategy:
|
| 71 |
+
- If frequency < 50 Hz -> discharge batteries, ramp up generators
|
| 72 |
+
- If frequency > 50 Hz -> charge batteries, ramp down generators
|
| 73 |
+
- If a line rho > 0.9 -> reduce generation near that line, do NOT open it
|
| 74 |
+
- Prefer minimal actions over aggressive switching
|
| 75 |
+
|
| 76 |
+
Respond with ONLY a valid JSON object. Example:
|
| 77 |
+
{"bus_adjustments": [{"bus_id": 2, "delta": 5.0}], "topology_actions": []}
|
| 78 |
+
"""
|
| 79 |
+
|
| 80 |
+
SYSTEM_PROMPT_MULTI = """You are a KPTCL Zone Controller AI managing one zone of the Karnataka power grid.
|
| 81 |
+
You can only see and control buses in YOUR zone. Other zones are managed by other agents.
|
| 82 |
+
|
| 83 |
+
Key objectives:
|
| 84 |
+
1. Keep grid frequency close to 50.0 Hz (you see a noisy reading)
|
| 85 |
+
2. Prevent line overloads in your zone (rho < 1.0)
|
| 86 |
+
3. Coordinate with other zones (don't fight against them)
|
| 87 |
+
4. Avoid actions that would trigger the safety layer
|
| 88 |
+
|
| 89 |
+
Available actions:
|
| 90 |
+
1. bus_adjustments: List of {"bus_id": int, "delta": float}
|
| 91 |
+
- ONLY adjust battery and generator buses in YOUR zone (avoid slack — physics overwrites it)
|
| 92 |
+
- Positive delta = increase power injection
|
| 93 |
+
- Negative delta = decrease power injection
|
| 94 |
+
2. topology_actions: List of {"line_id": str, "action": "open" | "close"}
|
| 95 |
+
- Only for visible internal or boundary lines. Safety layer will block dangerous switches.
|
| 96 |
+
- Boundary-line switching is risky; avoid unless necessary.
|
| 97 |
+
|
| 98 |
+
Strategy:
|
| 99 |
+
- If frequency < 50 Hz -> increase generation/discharge in your zone
|
| 100 |
+
- If frequency > 50 Hz -> decrease generation/charge in your zone
|
| 101 |
+
- Check neighbor signals to understand if other zones are compensating
|
| 102 |
+
- Prefer small corrections over large swings
|
| 103 |
+
|
| 104 |
+
Respond with ONLY a valid JSON object. Example:
|
| 105 |
+
{"bus_adjustments": [{"bus_id": 2, "delta": 5.0}], "topology_actions": []}
|
| 106 |
+
"""
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# ---------- Structured Logging ----------
|
| 110 |
+
|
| 111 |
+
def log_start(task: str, env: str, model: str, mode: str = "single"):
    """Emit the structured [START] log line announcing a new run."""
    fields = f"task={task} env={env} model={model} mode={mode}"
    print("[START] " + fields, flush=True)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def log_step(step: int, action: str, reward: float, done: bool, error=None, agent_id=None):
    """Emit one structured [STEP] log line for a single environment step.

    The optional agent_id adds an ``agent=...`` field (multi-agent mode);
    a falsy error is rendered as the literal string "null".
    """
    parts = [f"[STEP] step={step}"]
    if agent_id is not None:
        parts.append(f" agent={agent_id}")
    parts.append(f" action={action}")
    parts.append(f" reward={reward:.2f}")
    parts.append(f" done={str(done).lower()}")
    parts.append(f" error={str(error) if error else 'null'}")
    print("".join(parts), flush=True)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def clamp_score(s: float) -> float:
    """Ensure score is strictly in (0, 1). Mirrors grader._clamp_score.

    Non-numeric or non-finite input falls back to a neutral 0.5; otherwise
    the value is bounded to [0.02, 0.98] and truncated to 4 decimal places.
    """
    try:
        value = float(s)
    except (TypeError, ValueError):
        return 0.5
    if not math.isfinite(value):
        return 0.5
    bounded = min(max(value, 0.02), 0.98)
    # Truncate (not round) to 4 decimals, then re-bound against edge effects
    truncated = math.floor(bounded * 10000) / 10000
    return min(max(truncated, 0.02), 0.98)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def log_end(success: bool, steps: int, score: float, rewards: list, mode: str = "single"):
    """Emit the structured [END] log line summarizing a finished run.

    The score is passed through clamp_score so the logged value always
    lies in (0, 1); rewards are printed as a comma-joined 2-decimal list.
    """
    final_score = clamp_score(score)
    reward_list = ",".join(f"{r:.2f}" for r in rewards)
    line = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={final_score:.4f} rewards={reward_list} mode={mode}"
    )
    print(line, flush=True)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# ---------- LLM Call ----------
|
| 149 |
+
|
| 150 |
+
def get_model_message(client: OpenAI, step: int, obs_json: str, last_reward: float,
                      history: list, system_prompt: str, zone_name: str = None) -> str:
    """Ask the LLM what action to take given the current observation.

    Builds a user message containing the optional zone tag, the step number,
    the last reward, up to the last 3 history entries, and the serialized
    grid state, then calls the chat-completions API at temperature 0.

    Returns the raw (stripped) model text; on any request failure a no-op
    action JSON string is returned so the caller's parse step still works.
    """
    context = ""
    if zone_name:
        context += f"[Zone: {zone_name}] "
    context += f"Step {step} | Last reward: {last_reward:+.2f}\n"
    if history:
        # Only the most recent 3 entries are forwarded to bound prompt size
        context += "Recent history (last 3):\n" + "\n".join(history[-3:]) + "\n\n"
    context += f"Current Grid State:\n{obs_json}"

    try:
        kwargs = dict(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": context}
            ],
            temperature=0.0,
            max_tokens=300,
        )
        # Use JSON mode if the endpoint supports it (OpenAI-compatible)
        try:
            kwargs["response_format"] = {"type": "json_object"}
            response = client.chat.completions.create(**kwargs)
        except Exception:
            # Fallback: endpoint may not support response_format — retry
            # the same request without it
            kwargs.pop("response_format", None)
            response = client.chat.completions.create(**kwargs)
        return response.choices[0].message.content.strip()
    except Exception as exc:
        # Never propagate API errors to the rollout loop; a no-op action
        # keeps the episode alive
        print(f"[DEBUG] Model request failed: {exc}", flush=True)
        return '{"bus_adjustments": [], "topology_actions": []}'
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
# ---------- Environment Client ----------
|
| 186 |
+
|
| 187 |
+
class EnvClient:
    """HTTP client for the OpenGrid FastAPI environment.

    Wraps the single-agent (/reset, /step), multi-agent (/reset_multi,
    /step_multi) and shared (/state, /grader, /tasks) endpoints. The
    session id returned by a reset call is remembered and reused.
    """

    def __init__(self, base_url: str):
        self.base_url = base_url.rstrip("/")
        # Generous read timeout: environment steps run server-side physics.
        self.client = httpx.Client(
            timeout=httpx.Timeout(connect=10.0, read=60.0, write=30.0, pool=10.0)
        )
        self.session_id = None

    # --- internal helpers ---

    def _url(self, path: str) -> str:
        """Build a full endpoint URL from a path fragment."""
        return f"{self.base_url}{path}"

    @staticmethod
    def _checked(resp):
        """Raise on HTTP errors, then decode the JSON body."""
        resp.raise_for_status()
        return resp.json()

    # --- Single-Agent ---

    def reset(self, task_id: str) -> dict:
        """Start a new single-agent episode; remembers the session id."""
        data = self._checked(
            self.client.post(self._url("/reset"), params={"task_id": task_id})
        )
        self.session_id = data["session_id"]
        return data["observation"]

    def step(self, action_dict: dict) -> dict:
        """Apply one single-agent action to the current session."""
        return self._checked(
            self.client.post(
                self._url("/step"),
                params={"session_id": self.session_id},
                json=action_dict,
            )
        )

    # --- Multi-Agent ---

    def reset_multi(self, task_id: str) -> dict:
        """Start a new multi-agent episode; remembers the session id."""
        data = self._checked(
            self.client.post(self._url("/reset_multi"), params={"task_id": task_id})
        )
        self.session_id = data["session_id"]
        return data

    def step_multi(self, agent_actions: dict) -> dict:
        """Apply one joint action (all agents) to the current session."""
        return self._checked(
            self.client.post(
                self._url("/step_multi"),
                params={"session_id": self.session_id},
                json={"agent_actions": agent_actions},
            )
        )

    # --- Shared ---

    def state(self) -> dict:
        """Fetch the full current grid state for the session."""
        return self._checked(
            self.client.get(self._url("/state"), params={"session_id": self.session_id})
        )

    def grade(self) -> dict:
        """Request the episode grade.

        Grading can trigger lazy bound estimation (multiple rollouts),
        so this call overrides the client default with a 180s timeout.
        """
        return self._checked(
            self.client.get(
                self._url("/grader"),
                params={"session_id": self.session_id},
                timeout=180.0,
            )
        )

    def list_tasks(self) -> list:
        """Fetch available tasks from the server."""
        return self._checked(self.client.get(self._url("/tasks")))

    def close(self):
        """Release the underlying HTTP connection pool."""
        self.client.close()
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
# ---------- Parse Action ----------
|
| 261 |
+
|
| 262 |
+
NOOP_ACTION = {"bus_adjustments": [], "topology_actions": []}
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def parse_action(response_text: str) -> dict:
    """Parse LLM JSON response into an action dict.

    Handles markdown fences, prose preambles, JSON lists, and malformed
    output. Any unparseable input yields a fresh no-op action dict.
    """
    fallback = {"bus_adjustments": [], "topology_actions": []}
    try:
        text = str(response_text).strip()

        # Drop a surrounding markdown code fence, if any
        if text.startswith("```"):
            body = text.splitlines()
            if body and body[0].startswith("```"):
                body = body[1:]
            if body and body[-1].startswith("```"):
                body = body[:-1]
            text = "\n".join(body).strip()

        # Locate the outermost JSON object inside any surrounding prose
        first = text.find("{")
        last = text.rfind("}")
        if first < 0 or last <= first:
            return dict(fallback)

        parsed = json.loads(text[first:last + 1])

        # Some models emit a one-element list wrapping the object
        if isinstance(parsed, list):
            parsed = parsed[0] if parsed else {}
        if not isinstance(parsed, dict):
            return dict(fallback)

        adjustments = parsed.get("bus_adjustments", [])
        topo = parsed.get("topology_actions", [])
        # Normalize non-list values (e.g. null or a string) to empty lists
        if not isinstance(adjustments, list):
            adjustments = []
        if not isinstance(topo, list):
            topo = []
        return {
            "bus_adjustments": adjustments,
            "topology_actions": topo,
        }
    except Exception:
        return dict(fallback)
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
# ---------- Single-Agent Runner ----------
|
| 313 |
+
|
| 314 |
+
def run_task_single(client: OpenAI, env: EnvClient, task_id: str) -> dict:
    """Run one task in single-agent mode and return results.

    Drives an observe -> LLM -> act loop for up to MAX_STEPS steps, then
    asks the server-side grader for the final score. Any exception during
    the episode yields the floor score (0.05) instead of propagating.

    Returns a dict with keys: task, score, steps, success.
    """
    history_msgs = []
    rewards = []
    steps_taken = 0
    score = 0.05  # floor score used when the episode errors out
    success = False

    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME, mode="single")

    try:
        obs = env.reset(task_id)
        last_reward = 0.0

        for step_num in range(1, MAX_STEPS + 1):
            # Serialize observation and ask the model for the next action
            obs_json = json.dumps(obs, indent=2)
            message = get_model_message(client, step_num, obs_json, last_reward,
                                        history_msgs, SYSTEM_PROMPT_SINGLE)
            action_dict = parse_action(message)

            result = env.step(action_dict)
            obs = result["observation"]
            reward = result.get("reward", {}).get("value", 0.0)
            done = result.get("done", False)

            rewards.append(reward)
            steps_taken = step_num
            last_reward = reward

            # Truncate long action JSON for logging/history readability
            action_summary = json.dumps(action_dict)
            if len(action_summary) > 200:
                action_summary = action_summary[:200] + "..."

            log_step(step=step_num, action=action_summary, reward=reward, done=done)

            history_msgs.append(f"Step {step_num}: action={action_summary[:80]} -> reward {reward:+.2f}")

            if done:
                break

        # Grade after the loop — clamp to the (0, 1) open interval
        grade_result = env.grade()
        score = clamp_score(grade_result.get("score", 0.5))
        success = score >= SUCCESS_SCORE_THRESHOLD

    except Exception as e:
        # Episode failures are logged and scored at the floor, not raised
        print(f"[DEBUG] Task {task_id} error: {e}", flush=True)
        score = 0.05
        success = False

    log_end(success=success, steps=steps_taken, score=score, rewards=rewards, mode="single")

    return {"task": task_id, "score": score, "steps": steps_taken, "success": success}
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
# ---------- Multi-Agent Runner ----------
|
| 369 |
+
|
| 370 |
+
def run_task_multi(client: OpenAI, env: EnvClient, task_id: str) -> dict:
    """Run one task in multi-agent mode and return results.

    Each step, every agent independently queries the LLM with its own
    partial (zone-local) observation and history; the joint action is then
    submitted in a single /step_multi call. Safety-layer corrections are
    counted and fed back into the offending agent's history.

    Returns a dict with keys: task, score, steps, success,
    safety_interventions. Any exception yields the floor score (0.05).
    """
    rewards = []
    steps_taken = 0
    score = 0.05  # floor score used when the episode errors out
    success = False
    total_safety_interventions = 0

    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME, mode="multi")

    try:
        reset_data = env.reset_multi(task_id)
        num_agents = reset_data["num_agents"]
        zone_info = reset_data["zone_info"]
        observations = reset_data["observations"]

        # Per-agent history and last-reward tracking, keyed by string id
        agent_histories = {str(i): [] for i in range(num_agents)}
        last_rewards = {str(i): 0.0 for i in range(num_agents)}

        print(f"[INFO] Multi-agent mode: {num_agents} agents", flush=True)
        for aid, zi in zone_info.items():
            print(f" Agent {aid}: {zi['zone_name']} ({len(zi['bus_ids'])} buses)", flush=True)

        for step_num in range(1, MAX_STEPS + 1):
            agent_actions = {}

            # Each agent generates its own action based on partial observation
            for agent_id_str in sorted(observations.keys()):
                obs = observations.get(agent_id_str, {})
                zone_name = zone_info.get(agent_id_str, {}).get("zone_name", f"Zone_{agent_id_str}")

                obs_json = json.dumps(obs, indent=2)
                message = get_model_message(
                    client, step_num, obs_json,
                    last_rewards[agent_id_str],
                    agent_histories[agent_id_str],
                    SYSTEM_PROMPT_MULTI,
                    zone_name=zone_name
                )
                action_dict = parse_action(message)
                agent_actions[agent_id_str] = action_dict

            # Submit all actions together
            result = env.step_multi(agent_actions)
            observations = result["observations"]
            team_reward = result.get("team_reward", 0.0)
            done = result.get("done", False)

            # Track safety interventions (dict keyed by agent in the current
            # API; list format from the older API is still accepted)
            safety_reports = result.get("safety_reports", {})
            if isinstance(safety_reports, list):
                # Handle list format from older API
                step_interventions = sum(1 for sr in safety_reports if sr.get("was_corrected", False))
            else:
                step_interventions = sum(
                    1 for sr in safety_reports.values() if sr.get("was_corrected", False)
                )
            total_safety_interventions += step_interventions

            # Feed safety correction feedback into agent histories so the
            # model can see why its previous action was altered
            if isinstance(safety_reports, dict):
                for aid_str, sr in safety_reports.items():
                    if sr.get("was_corrected") and aid_str in agent_histories:
                        reason = sr.get("correction_reason", "action corrected")[:120]
                        agent_histories[aid_str].append(f"[SAFETY] {reason}")

            # Log per-agent rewards and append per-agent history entries
            per_agent_rewards = result.get("rewards", {})
            for agent_id_str in sorted(observations.keys()):
                agent_reward = per_agent_rewards.get(agent_id_str, {}).get("value", 0.0)
                last_rewards[agent_id_str] = agent_reward
                action_summary = json.dumps(agent_actions.get(agent_id_str, {}))
                if len(action_summary) > 100:
                    action_summary = action_summary[:100] + "..."
                agent_histories[agent_id_str].append(
                    f"Step {step_num}: action={action_summary[:60]} -> reward {agent_reward:+.2f}"
                )

            rewards.append(team_reward)
            steps_taken = step_num

            # Log team-level step (coordination score from oversight agent)
            oversight = result.get("oversight_report", {})
            coord_score = oversight.get("coordination_score", 1.0)
            safety_str = f" safety_corrections={step_interventions}" if step_interventions > 0 else ""
            log_step(step=step_num, action=f"team_reward={team_reward:.2f} coord={coord_score:.2f}{safety_str}",
                     reward=team_reward, done=done)

            if done:
                break

        # Grade after the loop — clamp to the (0, 1) open interval
        grade_result = env.grade()
        score = clamp_score(grade_result.get("score", 0.5))
        success = score >= SUCCESS_SCORE_THRESHOLD

    except Exception as e:
        # Episode failures are logged and scored at the floor, not raised
        print(f"[DEBUG] Task {task_id} multi-agent error: {e}", flush=True)
        score = 0.05
        success = False

    print(f"[INFO] Total safety interventions: {total_safety_interventions}", flush=True)
    log_end(success=success, steps=steps_taken, score=score, rewards=rewards, mode="multi")

    return {
        "task": task_id, "score": score, "steps": steps_taken,
        "success": success, "safety_interventions": total_safety_interventions
    }
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
# ---------- Main ----------
|
| 481 |
+
|
| 482 |
+
def main():
    """Run inference on all tasks.

    Parses CLI flags (--multi for multi-agent mode, --tasks to select a
    subset), validates API credentials, runs each selected task with the
    appropriate runner, and prints a final per-task and average summary.
    Exits with status 1 when no API key is configured.
    """
    parser = argparse.ArgumentParser(description="OpenGrid LLM Inference")
    parser.add_argument("--multi", action="store_true",
                        help="Use multi-agent POMDP mode (default: single-agent)")
    parser.add_argument("--tasks", nargs="+", default=TASKS,
                        help="Which tasks to run (default: all)")
    args = parser.parse_args()

    if not API_KEY:
        print("[ERROR] No API key found. Set OPENAI_API_KEY or HF_TOKEN environment variable.", flush=True)
        sys.exit(1)

    mode = "multi-agent" if args.multi else "single-agent"
    print(f"[CONFIG] API_BASE_URL={API_BASE_URL}", flush=True)
    print(f"[CONFIG] MODEL_NAME={MODEL_NAME}", flush=True)
    print(f"[CONFIG] ENV_URL={ENV_URL}", flush=True)
    print(f"[CONFIG] MODE={mode}", flush=True)

    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY, timeout=30.0, max_retries=2)
    env = EnvClient(ENV_URL)

    all_results = []
    # Single dispatch point: both runners share the same signature
    runner = run_task_multi if args.multi else run_task_single

    try:
        for task_id in args.tasks:
            print(f"\n{'='*60}", flush=True)
            print(f"Running task: {task_id} ({mode})", flush=True)
            print(f"{'='*60}", flush=True)

            result = runner(client, env, task_id)
            all_results.append(result)

    finally:
        # Always release the HTTP connection pool, even on error
        env.close()

    # Summary
    print(f"\n{'='*60}", flush=True)
    print(f"FINAL RESULTS ({mode})", flush=True)
    print(f"{'='*60}", flush=True)
    for r in all_results:
        status = "PASS" if r["success"] else "FAIL"
        extra = ""
        # safety_interventions is only present in multi-agent results
        if "safety_interventions" in r:
            extra = f" safety={r['safety_interventions']}"
        print(f" {r['task']}: score={r['score']:.4f} steps={r['steps']} [{status}]{extra}", flush=True)

    avg_score = sum(r["score"] for r in all_results) / len(all_results) if all_results else 0
    print(f"\n Average Score: {avg_score:.4f}", flush=True)


if __name__ == "__main__":
    main()
|
openenv.yaml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: opengrid
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: app:app
|
| 6 |
+
port: 7860
|
| 7 |
+
|
| 8 |
+
# Environment supports both single-agent and multi-agent POMDP modes.
|
| 9 |
+
# Single-agent: /reset + /step (backward compatible)
|
| 10 |
+
# Multi-agent: /reset_multi + /step_multi (2-3 agents per zone)
|
| 11 |
+
|
| 12 |
+
tasks:
|
| 13 |
+
- id: task_easy
|
| 14 |
+
name: Easy Grid (5 buses, 2 agents, 20% renewables)
|
| 15 |
+
description: Basic frequency control with 2-zone coordination
|
| 16 |
+
agents: 2
|
| 17 |
+
grader:
|
| 18 |
+
endpoint: /grader
|
| 19 |
+
score_range: [0.02, 0.98]
|
| 20 |
+
- id: task_medium
|
| 21 |
+
name: Medium Grid (10 buses, 3 agents, 50% renewables)
|
| 22 |
+
description: Congestion management with 3-zone POMDP and volatile renewables
|
| 23 |
+
agents: 3
|
| 24 |
+
grader:
|
| 25 |
+
endpoint: /grader
|
| 26 |
+
score_range: [0.02, 0.98]
|
| 27 |
+
- id: task_hard
|
| 28 |
+
name: Hard Grid (14 buses, 3 agents, 70% renewables)
|
| 29 |
+
description: High volatility, tight margins, complex topology with safety constraints
|
| 30 |
+
agents: 3
|
| 31 |
+
grader:
|
| 32 |
+
endpoint: /grader
|
| 33 |
+
score_range: [0.02, 0.98]
|
| 34 |
+
- id: task_karnataka
|
| 35 |
+
name: Karnataka KPTCL Grid (5 buses, 2 agents, real-world topology)
|
| 36 |
+
description: Realistic Karnataka power grid with POMDP multi-agent coordination
|
| 37 |
+
agents: 2
|
| 38 |
+
grader:
|
| 39 |
+
endpoint: /grader
|
| 40 |
+
score_range: [0.02, 0.98]
|
pyproject.toml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68.0", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "opengrid"
|
| 7 |
+
version = "1.0.0"
|
| 8 |
+
description = "Renewable energy grid load-balancing environment for AI agents"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
license = {text = "MIT"}
|
| 11 |
+
requires-python = ">=3.10"
|
| 12 |
+
authors = [
|
| 13 |
+
{name = "KRISHNA GOYAL", email = "krishnagoyalcse@gmail.com"}
|
| 14 |
+
]
|
| 15 |
+
dependencies = [
|
| 16 |
+
"fastapi",
|
| 17 |
+
"uvicorn",
|
| 18 |
+
"pydantic>=2.0",
|
| 19 |
+
"numpy",
|
| 20 |
+
"networkx",
|
| 21 |
+
"matplotlib",
|
| 22 |
+
"openai",
|
| 23 |
+
"httpx",
|
| 24 |
+
"openenv-core>=0.2.0",
|
| 25 |
+
]
|
| 26 |
+
|
| 27 |
+
[project.urls]
|
| 28 |
+
Homepage = "https://github.com/K446/opengrid"
|
| 29 |
+
|
| 30 |
+
[project.scripts]
|
| 31 |
+
server = "server.app:main"
|
| 32 |
+
|
| 33 |
+
[tool.setuptools.packages.find]
|
| 34 |
+
where = ["."]
|
| 35 |
+
include = ["src*", "server*"]
|
| 36 |
+
|
| 37 |
+
[tool.pyright]
|
| 38 |
+
venvPath = "."
|
| 39 |
+
venv = ".venv"
|
| 40 |
+
pythonVersion = "3.13"
|
| 41 |
+
extraPaths = ["."]
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn[standard]
|
| 3 |
+
pydantic>=2.0
|
| 4 |
+
numpy
|
| 5 |
+
networkx
|
| 6 |
+
matplotlib
|
| 7 |
+
openai
|
| 8 |
+
httpx
|
| 9 |
+
openenv-core>=0.2.0
|
server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Server package
|
server/app.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenGrid server entry point — used by openenv for multi-mode deployment.
|
| 3 |
+
Re-exports the FastAPI app from the root app module.
|
| 4 |
+
"""
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
import uvicorn
|
| 8 |
+
|
| 9 |
+
# Add parent directory to path so we can import from the root package
|
| 10 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 11 |
+
|
| 12 |
+
from app import app # type: ignore[import-not-found] # noqa: E402, F401
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def main():
|
| 16 |
+
"""Entry point for openenv server mode."""
|
| 17 |
+
uvicorn.run(app, host="0.0.0.0", port=7860)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
if __name__ == "__main__":
|
| 21 |
+
main()
|
src/__init__.py
ADDED
|
File without changes
|
src/baseline.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Baseline Policies for OpenGrid
|
| 3 |
+
================================
|
| 4 |
+
Provides two agent implementations:
|
| 5 |
+
1. heuristic_policy — deterministic rule-based baseline for reproducible scoring
|
| 6 |
+
2. llm_policy — LLM-based policy using OpenAI-compatible API
|
| 7 |
+
|
| 8 |
+
Both support GridObservation (single-agent) and ZoneObservation (multi-agent).
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import json
|
| 12 |
+
import logging
|
| 13 |
+
import os
|
| 14 |
+
from typing import List, Union
|
| 15 |
+
|
| 16 |
+
from openai import OpenAI
|
| 17 |
+
from .models import GridAction, BusAdjustment, GridObservation, ZoneObservation
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
# API configuration — HF_TOKEN for Hugging Face endpoints, OPENAI_API_KEY for OpenAI
|
| 22 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
|
| 23 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o")
|
| 24 |
+
API_KEY = os.getenv("OPENAI_API_KEY", os.getenv("HF_TOKEN", ""))
|
| 25 |
+
|
| 26 |
+
# Cached client instance
|
| 27 |
+
_CLIENT = None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _get_client() -> OpenAI:
    """Lazy-cached client creation.

    Builds the module-level OpenAI client on first use (so importing this
    module never requires credentials) and reuses it afterwards.

    Raises:
        RuntimeError: if neither OPENAI_API_KEY nor HF_TOKEN is set.
    """
    global _CLIENT
    if _CLIENT is None:
        if not API_KEY:
            raise RuntimeError(
                "Missing API key. Set OPENAI_API_KEY or HF_TOKEN environment variable."
            )
        _CLIENT = OpenAI(base_url=API_BASE_URL, api_key=API_KEY, timeout=15.0)
    return _CLIENT
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _obs_buses(obs):
|
| 43 |
+
"""Extract bus list from either GridObservation or ZoneObservation."""
|
| 44 |
+
return getattr(obs, "buses", getattr(obs, "local_buses", []))
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _obs_lines(obs):
|
| 48 |
+
"""Extract line list from either GridObservation or ZoneObservation."""
|
| 49 |
+
if hasattr(obs, "lines"):
|
| 50 |
+
return obs.lines
|
| 51 |
+
internal = getattr(obs, "internal_lines", [])
|
| 52 |
+
boundary = getattr(obs, "boundary_lines", [])
|
| 53 |
+
return list(internal) + list(boundary)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
SYSTEM_PROMPT = """You are a Power Grid Controller AI. Your goal is to maintain grid stability.
|
| 57 |
+
|
| 58 |
+
Key objectives:
|
| 59 |
+
1. Keep grid frequency close to 50.0 Hz (acceptable: 49.5–50.5 Hz)
|
| 60 |
+
2. Prevent transmission line overloads (rho < 1.0)
|
| 61 |
+
3. Avoid grid islanding (blackout)
|
| 62 |
+
|
| 63 |
+
Available actions:
|
| 64 |
+
1. bus_adjustments: List of {"bus_id": int, "delta": float}
|
| 65 |
+
- Positive delta = increase power injection (discharge battery / ramp up generator)
|
| 66 |
+
- Negative delta = decrease power injection (charge battery / ramp down generator)
|
| 67 |
+
- Only works on battery and generator buses (NOT slack, load, solar, or wind)
|
| 68 |
+
- Slack bus injection is computed by physics — adjustments are ignored
|
| 69 |
+
2. topology_actions: List of {"line_id": str, "action": "open" | "close"}
|
| 70 |
+
- Opening a line removes it; closing reconnects. 3-step cooldown after each switch.
|
| 71 |
+
- WARNING: Opening lines can cause islanding → blackout → -100 reward
|
| 72 |
+
- Prefer NO topology actions unless absolutely necessary.
|
| 73 |
+
|
| 74 |
+
Strategy tips:
|
| 75 |
+
- If frequency < 50 Hz: grid needs more generation → discharge batteries or ramp up generators
|
| 76 |
+
- If frequency > 50 Hz: grid has excess generation → charge batteries or ramp down generators
|
| 77 |
+
- If a line rho > 0.9: reduce generation at one end or increase at the other to shift flow
|
| 78 |
+
- Prefer minimal actions. Do-nothing is better than reckless switching.
|
| 79 |
+
|
| 80 |
+
Respond with ONLY a valid JSON object, no markdown, no explanation. Example:
|
| 81 |
+
{"bus_adjustments": [{"bus_id": 2, "delta": 5.0}], "topology_actions": []}
|
| 82 |
+
"""
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def parse_action_response(response_text: str) -> GridAction:
    """Parse an LLM response into a GridAction. Falls back to no-op on parse errors."""
    try:
        payload = response_text.strip()

        # Peel off a markdown code fence when present
        if payload.startswith("```"):
            rows = payload.splitlines()
            if rows[0].startswith("```"):
                rows = rows[1:]
            if rows and rows[-1].startswith("```"):
                rows = rows[:-1]
            payload = "\n".join(rows).strip()

        # Grab the first {...} object embedded in the text
        lo = payload.find("{")
        hi = payload.rfind("}")
        if lo == -1 or hi == -1 or hi <= lo:
            return GridAction()

        parsed = json.loads(payload[lo:hi + 1])

        # Some models wrap the object in a one-element list
        if isinstance(parsed, list):
            parsed = parsed[0] if parsed else {}

        return GridAction(**parsed)
    except Exception:
        return GridAction()
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def llm_policy(obs: Union[GridObservation, ZoneObservation]) -> GridAction:
    """LLM-based policy using the OpenAI-compatible API.

    Supports both GridObservation and ZoneObservation.
    Falls back to a no-op GridAction on any request/parse error.
    """
    api = _get_client()
    state_json = obs.model_dump_json()

    try:
        completion = api.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": f"Current Grid State:\n{state_json}"},
            ],
            temperature=0.0,
            max_tokens=300,
        )
        return parse_action_response(completion.choices[0].message.content)
    except Exception as e:
        logger.debug("LLM policy error: %s", e, exc_info=True)
        return GridAction()
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def heuristic_policy(
    obs: Union[GridObservation, ZoneObservation],
) -> GridAction:
    """Rule-based baseline policy for reproducible scoring.

    Strategy:
    - Use batteries and generators for frequency regulation (proportional control)
    - DO NOT open overloaded lines (causes cascading failures)
    - DO NOT adjust the slack bus (overwritten by physics solver)
    - Let the environment/safety layer clamp any out-of-range deltas

    Supports both GridObservation (single-agent) and ZoneObservation (multi-agent).
    """
    adj = []
    freq = obs.grid_frequency
    freq_error = freq - 50.0  # positive = too high, negative = too low

    buses = list(_obs_buses(obs))
    lines = list(_obs_lines(obs))

    batteries = [b for b in buses if b.type == 'battery']
    generators = [b for b in buses if b.type == 'generator']

    # --- 1. Proportional frequency control via batteries ---
    # Deadband of 0.1 Hz avoids chattering on small noise
    if abs(freq_error) > 0.1 and batteries:
        # Distribute correction across all available batteries
        correction_total = -freq_error * 15.0  # stronger gain than naive 2.0
        correction_total = max(-20.0, min(20.0, correction_total))
        per_battery = correction_total / len(batteries)

        for bus in batteries:
            if per_battery > 0 and bus.soc > 0:
                # Discharge — safety layer clamps to actual SOC
                adj.append(BusAdjustment(bus_id=bus.id, delta=per_battery))
            elif per_battery < 0:
                # Charge — safety layer clamps to remaining capacity
                adj.append(BusAdjustment(bus_id=bus.id, delta=per_battery))

    # --- 2. Generator response for larger deviations ---
    # Generators engage only past 0.25 Hz; each is limited by its own
    # ramp_rate (assumed attribute; defaults to 20.0 MW if absent)
    if abs(freq_error) > 0.25:
        for bus in generators:
            delta = -freq_error * 5.0
            ramp = getattr(bus, 'ramp_rate', 20.0)
            delta = max(-ramp, min(ramp, delta))
            adj.append(BusAdjustment(bus_id=bus.id, delta=delta))

    # --- 3. Overload relief via generators (not slack) ---
    # For each overloaded connected line, back off one generator that is
    # still injecting > 5 (units presumably MW — confirm against models);
    # the set ensures each generator is reduced at most once per call.
    adjusted_for_overload = set()
    for line in lines:
        if line.rho > 0.95 and line.connected:
            for bus in generators:
                if bus.id not in adjusted_for_overload and bus.p_injection > 5:
                    adj.append(BusAdjustment(bus_id=bus.id, delta=-3.0))
                    adjusted_for_overload.add(bus.id)
                    break

    # No topology actions — much safer than opening overloaded lines
    return GridAction(bus_adjustments=adj, topology_actions=[])
|
src/environment.py
ADDED
|
@@ -0,0 +1,672 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import math
|
| 3 |
+
from typing import List, Dict, Tuple, Optional
|
| 4 |
+
from .models import (
|
| 5 |
+
GridObservation, GridAction, GridReward, GridInfo,
|
| 6 |
+
LineStatus, BusState, ZoneObservation, ZoneInfo,
|
| 7 |
+
SafetyReport, OversightReport, MultiAgentStepResult,
|
| 8 |
+
)
|
| 9 |
+
from .physics import DCSolver, IslandedException
|
| 10 |
+
from .safety import SafetyLayer
|
| 11 |
+
from .oversight import OversightAgent
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class OpenGridEnv:
    """
    OpenGrid: A renewable energy grid load-balancing environment.

    Supports two modes:
    1. Single-agent (backward compatible): reset()/step()/state()
    2. Multi-agent POMDP: reset_multi()/step_multi() with per-zone
       partial observability, safety layer, and oversight agent.

    The agent(s) must maintain grid stability by:
    - Balancing generation and load (frequency control)
    - Managing transmission line loading (congestion management)
    - Coordinating battery storage and topology switching
    """

    NOMINAL_FREQ = 50.0
    FREQ_DEADBAND = 0.5   # Hz — acceptable deviation band
    FREQ_NOISE_STD = 0.05 # Hz — noise added to POMDP observations
    LINE_NOISE_STD = 0.02 # fraction — noise added to line readings

    def __init__(self, config: Dict) -> None:
        """Build the environment from a task config dict.

        The config supplies grid topology (``num_buses``, ``lines``,
        ``buses``), optional multi-agent zone partitions, and episode
        parameters (``max_steps``, ``seed``).
        """
        self.config = config
        self.num_buses = config['num_buses']
        self.lines_config = config['lines']
        self.buses_config = config['buses']

        # Resolve slack bus from config (not hardcoded to index 0)
        self.slack_bus_id = next(
            (b['id'] for b in self.buses_config if b['type'] == 'slack'), 0
        )

        self.solver = DCSolver(self.num_buses, slack_bus=self.slack_bus_id)
        self.timestep = 0
        self.max_steps = config.get('max_steps', 50)

        # Dynamic per-episode state; populated in reset()
        self.bus_state = []
        self.line_state = []
        self.cooldowns = {}
        self.slack_injection = 0.0
        self._is_blackout = False

        # Build index dicts for O(1) lookups
        self._bus_cfg_by_id = {b['id']: b for b in self.buses_config}
        self._line_cfg_by_id = {l['id']: l for l in self.lines_config}

        # Multi-agent config
        self.num_agents = config.get('num_agents', 1)
        self.zone_assignments = config.get('zone_assignments', {})
        self.zone_names = config.get('zone_names', [])
        self.zone_bus_ids = config.get('zone_bus_ids', {})
        self.internal_lines = config.get('internal_lines', {})
        self.boundary_lines = config.get('boundary_lines', {})

        # Safety and oversight (initialized on first multi-agent use)
        self.safety_layer = SafetyLayer(config)
        self.oversight_agent = OversightAgent(config)

        # Episode tracking for multi-agent rewards
        self._safety_reports_this_step: List[SafetyReport] = []
        self._oversight_report_this_step: Optional[OversightReport] = None

        # Calibrate droop constant to system size so the frequency proxy
        # stays in a sensible range regardless of grid scale.
        total_load = sum(
            b['base_p'] for b in self.buses_config if b['type'] == 'load'
        )
        total_gen = sum(
            b['max_p'] for b in self.buses_config
            if b['type'] in ['slack', 'generator', 'solar', 'wind']
        )
        total_system = max(total_load + total_gen, 50.0)
        self.droop_constant = 2.5 / total_system

        # Per-episode RNG — initialized early so _update_loads_and_renewables never crashes
        self._seed = config.get('seed', 42)
        self._rng = np.random.default_rng(self._seed)

    # ======================================================================
    # State Restoration (for GRPO environment-grounded rewards)
    # ======================================================================

    def _set_state(self, obs_dict: dict) -> None:
        """Restore the environment to a state described by an observation dict.

        This enables environment-grounded GRPO rewards: instead of scoring
        actions with a heuristic proxy, we restore the env to the observed state,
        step with the proposed action, and use the real reward.

        Args:
            obs_dict: A dict from ZoneObservation.model_dump() or
                GridObservation.model_dump(), containing at minimum:
                timestep, grid_frequency, and bus/line state.
        """
        self.timestep = obs_dict.get('timestep', 0)
        self._is_blackout = obs_dict.get('is_blackout', False)
        self.cooldowns = obs_dict.get('cooldowns', {k: 0 for k in self.cooldowns})

        # Restore bus state from observation.
        # ZoneObservation uses 'local_buses'; GridObservation uses 'buses'.
        local_buses = obs_dict.get('local_buses', obs_dict.get('buses', []))
        if local_buses:
            for b_obs in local_buses:
                b_dyn = self._find_bus_state(b_obs['id'])
                if b_dyn is not None:
                    b_dyn['p'] = b_obs.get('p_injection', b_dyn['p'])
                    b_dyn['soc'] = b_obs.get('soc', b_dyn.get('soc', 0.0))

        # Restore line state from observation (any of the three list fields
        # may be absent depending on observation type).
        all_lines = (obs_dict.get('internal_lines', []) or []) + \
                    (obs_dict.get('boundary_lines', []) or []) + \
                    (obs_dict.get('lines', []) or [])
        for l_obs in all_lines:
            l_dyn = self._find_line(l_obs['id'])
            if l_dyn is not None:
                l_dyn['connected'] = l_obs.get('connected', True)
                l_dyn['flow'] = l_obs.get('flow', 0.0)

        # Rebuild lookup indices
        self._bus_state_by_id = {b['id']: b for b in self.bus_state}
        self._line_state_by_id = {l['id']: l for l in self.line_state}

        # Re-derive slack injection from frequency if available
        # (inverse of the droop relation in _compute_frequency).
        freq = obs_dict.get('grid_frequency', self.NOMINAL_FREQ)
        self.slack_injection = (self.NOMINAL_FREQ - freq) / self.droop_constant

        # Update slack bus p to match
        slack_dyn = self._find_bus_state(self.slack_bus_id)
        if slack_dyn is not None:
            slack_dyn['p'] = self.slack_injection

    # ======================================================================
    # Single-Agent API (backward compatible)
    # ======================================================================

    def reset(self) -> GridObservation:
        """Reset the environment to initial state. Returns initial observation."""
        self.timestep = 0
        self.slack_injection = 0.0
        self.cooldowns = {l['id']: 0 for l in self.lines_config}
        self._rng = np.random.default_rng(self._seed)
        self.oversight_agent.reset()

        self.bus_state = []
        for b in self.buses_config:
            init_p = 0.0
            # Initialize generators at 50% capacity so slack doesn't absorb all load
            if b['type'] in ['generator']:
                init_p = b['max_p'] * 0.5
            self.bus_state.append({
                'id': b['id'], 'p': init_p, 'soc': b.get('init_soc', 0.0)
            })
        self.line_state = [
            {'id': l['id'], 'connected': True, 'flow': 0.0}
            for l in self.lines_config
        ]

        # Build O(1) lookup indices for dynamic state
        self._bus_state_by_id = {b['id']: b for b in self.bus_state}
        self._line_state_by_id = {l['id']: l for l in self.line_state}

        self._is_blackout = False
        self._update_loads_and_renewables()
        self._run_power_flow()

        return self._get_obs()

    def step(self, action: GridAction) -> Tuple[GridObservation, GridReward, bool, GridInfo]:
        """Execute one step: apply action, update dynamics, solve physics, compute reward."""
        self.timestep += 1
        reward_components = {"survival": 1.0, "frequency": 0.0, "overload": 0.0, "action_cost": 0.0}
        self._is_blackout = False

        # 1. Apply topology actions (with cooldown enforcement)
        for t_act in action.topology_actions:
            l_id = t_act.line_id
            if l_id not in self.cooldowns:
                # Unknown line id — silently ignore
                continue
            if self.cooldowns[l_id] == 0:
                line = self._find_line(l_id)
                if line is None:
                    continue
                current_status = line['connected']
                new_status = (t_act.action == "close")

                # Only a real status change costs reward and starts a cooldown
                if current_status != new_status:
                    line['connected'] = new_status
                    self.cooldowns[l_id] = 3
                    reward_components['action_cost'] -= 0.5

        # Tick cooldowns
        # NOTE(review): a cooldown set to 3 just above is decremented in the
        # same step (effective lockout of 2 further steps) — confirm intended.
        for l_id in self.cooldowns:
            self.cooldowns[l_id] = max(0, self.cooldowns[l_id] - 1)

        # 2. Apply power adjustment actions
        for adj in action.bus_adjustments:
            bus_cfg = self._find_bus_config(adj.bus_id)
            bus_dyn = self._find_bus_state(adj.bus_id)
            if bus_cfg is None or bus_dyn is None:
                continue

            delta = adj.delta

            if bus_cfg['type'] == 'battery':
                # Sign convention: positive delta = discharge (inject power),
                # negative delta = charge (absorb power).
                max_charge = bus_cfg['capacity'] - bus_dyn['soc']
                max_discharge = bus_dyn['soc']

                if delta > 0:
                    delta = min(delta, max_discharge)
                else:
                    delta = max(delta, -max_charge)

                bus_dyn['soc'] = np.clip(bus_dyn['soc'] - delta, 0.0, bus_cfg['capacity'])
                bus_dyn['p'] = delta

            elif bus_cfg['type'] not in ['load', 'solar', 'wind']:
                # Dispatchable units: ramp-limited, then clamped to [min_p, max_p]
                max_ramp = bus_cfg.get('ramp_rate', 10.0)
                delta = np.clip(delta, -max_ramp, max_ramp)
                new_p = bus_dyn['p'] + delta
                bus_dyn['p'] = np.clip(new_p, bus_cfg['min_p'], bus_cfg['max_p'])

        # 3. Update load/renewable dynamics
        self._update_loads_and_renewables()

        # 4. Solve physics
        try:
            self._run_power_flow()

            # Check line overloads
            for l in self.line_state:
                if l['connected']:
                    flow = l['flow']
                    limit = self._get_line_capacity(l['id'])
                    rho = abs(flow) / limit if limit > 0 else 0.0

                    if rho > 1.0:
                        # Quadratic penalty on the overload fraction
                        reward_components['overload'] -= (rho - 1.0) ** 2 * 20
                    elif rho > 0.8:
                        # Small penalty for operating near the limit
                        reward_components['overload'] -= 0.1

            # Frequency reward
            freq = self._compute_frequency()
            freq_dev = abs(freq - self.NOMINAL_FREQ)
            if freq_dev > self.FREQ_DEADBAND:
                raw_penalty = (freq_dev - self.FREQ_DEADBAND) * 0.5
                reward_components['frequency'] -= min(raw_penalty, 1.5)
            elif freq_dev < 0.1:
                # Bonus for tight frequency regulation
                reward_components['frequency'] += 0.2

        except IslandedException:
            # Grid split into islands — treated as a blackout, episode ends
            self._is_blackout = True
            reward_components['survival'] = -100.0

        done = self._is_blackout or (self.timestep >= self.max_steps)

        total_reward = sum(reward_components.values())
        reward = GridReward(value=total_reward, components=reward_components)
        info = GridInfo(task_id=self.config['id'], is_blackout=self._is_blackout)

        return self._get_obs(), reward, done, info

    def state(self) -> GridObservation:
        """Return current state (alias for observation)."""
        return self._get_obs()

    # ======================================================================
    # Multi-Agent POMDP API
    # ======================================================================

    def reset_multi(self) -> Dict[int, ZoneObservation]:
        """Reset environment and return per-agent partial observations."""
        self.reset()  # Reuse single-agent reset for state initialization
        return {
            agent_id: self._get_zone_obs(agent_id)
            for agent_id in range(self.num_agents)
        }

    def step_multi(self, agent_actions: Dict[int, GridAction]) -> MultiAgentStepResult:
        """Multi-agent step with safety layer and oversight.

        Flow:
        1. Safety layer validates each agent's actions
        2. Combine corrected actions into one GridAction
        3. Run single-agent step with combined action
        4. Oversight agent evaluates coordination
        5. Compute per-agent rewards (local + global + safety + coordination)
        """
        # Snapshot pre-step state for the oversight evaluation
        pre_frequency = self._compute_frequency()
        pre_bus_state = [dict(b) for b in self.bus_state]

        # --- 1. Safety validation per agent ---
        safety_reports: Dict[int, SafetyReport] = {}
        corrected_actions: Dict[int, GridAction] = {}

        for agent_id in range(self.num_agents):
            # Missing agents default to a no-op action
            proposed = agent_actions.get(agent_id, GridAction())
            corrected, report = self.safety_layer.validate_and_correct(
                agent_id=agent_id,
                proposed_action=proposed,
                current_line_state=self.line_state,
                current_bus_state=self.bus_state,
                cooldowns=self.cooldowns,
            )
            corrected_actions[agent_id] = corrected
            safety_reports[agent_id] = report

        self._safety_reports_this_step = safety_reports

        # --- 2. Combine all corrected actions ---
        combined = GridAction(
            bus_adjustments=[
                adj for action in corrected_actions.values()
                for adj in action.bus_adjustments
            ],
            topology_actions=[
                t for action in corrected_actions.values()
                for t in action.topology_actions
            ],
        )

        # --- 3. Run the step ---
        obs, base_reward, done, info = self.step(combined)
        post_frequency = self._compute_frequency()

        # --- 4. Oversight evaluation ---
        oversight_report = self.oversight_agent.evaluate(
            agent_actions=agent_actions,
            safety_reports=safety_reports,
            pre_frequency=pre_frequency,
            post_frequency=post_frequency,
            pre_bus_state=pre_bus_state,
            post_bus_state=self.bus_state,
        )
        self._oversight_report_this_step = oversight_report

        # --- 5. Per-agent rewards ---
        per_agent_rewards = {}
        for agent_id in range(self.num_agents):
            agent_reward = self._compute_agent_reward(
                agent_id=agent_id,
                base_reward=base_reward,
                safety_report=safety_reports.get(agent_id),
                oversight_report=oversight_report,
                is_blackout=info.is_blackout,
            )
            per_agent_rewards[agent_id] = agent_reward

        team_reward = base_reward.value

        # --- 6. Per-agent partial observations ---
        per_agent_obs = {
            agent_id: self._get_zone_obs(agent_id)
            for agent_id in range(self.num_agents)
        }

        # Propagate blackout to observations
        # NOTE(review): the loop variable `obs` shadows the global observation
        # returned by self.step() above; harmless here since the global obs is
        # not used afterwards, but worth renaming.
        if info.is_blackout:
            for obs in per_agent_obs.values():
                obs.is_blackout = True

        return MultiAgentStepResult(
            observations=per_agent_obs,
            rewards=per_agent_rewards,
            team_reward=round(team_reward, 4),
            done=done,
            safety_reports=safety_reports,
            oversight_report=oversight_report,
            info=info,
        )

    def get_zone_info(self) -> Dict[int, ZoneInfo]:
        """Get metadata about each agent's zone."""
        zones = {}
        for agent_id in range(self.num_agents):
            zones[agent_id] = ZoneInfo(
                agent_id=agent_id,
                zone_name=self.zone_names[agent_id] if agent_id < len(self.zone_names) else f"Zone_{agent_id}",
                bus_ids=self.zone_bus_ids.get(agent_id, []),
                boundary_line_ids=self.boundary_lines.get(agent_id, []),
                internal_line_ids=self.internal_lines.get(agent_id, []),
            )
        return zones

    # ======================================================================
    # Multi-Agent Reward Computation
    # ======================================================================

    def _compute_agent_reward(
        self,
        agent_id: int,
        base_reward: GridReward,
        safety_report: Optional[SafetyReport],
        oversight_report: OversightReport,
        is_blackout: bool,
    ) -> GridReward:
        """Compute per-agent reward with composable components.

        Components:
        - survival: shared team component (same for all)
        - frequency: shared (all agents affected equally)
        - local_congestion: penalty for overloads in agent's zone
        - safety_compliance: penalty if safety layer corrected the action
        - coordination: penalty from oversight for selfish/conflicting behavior
        - efficiency: small bonus for minimal actions
        """
        components = {}

        # Shared components (from base reward)
        components['survival'] = base_reward.components.get('survival', 1.0)
        components['frequency'] = base_reward.components.get('frequency', 0.0)

        # Global overload shared equally — ensures no line's penalty is lost
        components['overload_shared'] = base_reward.components.get('overload', 0.0) / max(self.num_agents, 1)

        # Local congestion: additional penalty for overloads on lines in agent's zone
        zone_overload = 0.0
        agent_lines = set(self.internal_lines.get(agent_id, []))
        agent_lines.update(self.boundary_lines.get(agent_id, []))
        for l in self.line_state:
            if l['id'] in agent_lines and l['connected']:
                limit = self._get_line_capacity(l['id'])
                rho = abs(l['flow']) / limit if limit > 0 else 0.0
                if rho > 1.0:
                    zone_overload -= (rho - 1.0) ** 2 * 10
                elif rho > 0.8:
                    zone_overload -= 0.05
        components['local_congestion'] = zone_overload

        # Safety compliance penalty — scaled by how many topology actions
        # were blocked by the safety layer.
        if safety_report and safety_report.was_corrected:
            components['safety_compliance'] = -0.3 * (
                1 + safety_report.blocked_topology_actions
            )
        else:
            components['safety_compliance'] = 0.1  # Bonus for safe actions

        # Coordination penalty from oversight
        coord_penalty = oversight_report.coordination_penalties.get(agent_id, 0.0)
        components['coordination'] = -coord_penalty

        # Action cost, split evenly across agents
        components['action_cost'] = base_reward.components.get('action_cost', 0.0) / max(self.num_agents, 1)

        total = sum(components.values())
        return GridReward(value=round(total, 4), components=components)

    # ======================================================================
    # POMDP Observation
    # ======================================================================

    def _get_zone_obs(self, agent_id: int) -> ZoneObservation:
        """Build partial observation for one agent (POMDP).

        Each agent sees:
        - Only buses in their zone
        - Internal + boundary lines
        - Noisy global frequency
        - Limited neighbor signals
        """
        # Local buses
        zone_bus_ids = set(self.zone_bus_ids.get(agent_id, []))
        local_buses = []
        zone_load = 0.0
        zone_gen = 0.0
        for b in self.bus_state:
            if b['id'] in zone_bus_ids:
                b_cfg = self._find_bus_config(b['id'])
                if b_cfg is None:
                    continue
                local_buses.append(BusState(
                    id=b['id'], type=b_cfg['type'],
                    p_injection=round(b['p'], 4),
                    soc=round(b.get('soc', 0.0), 4),
                    ramp_rate=b_cfg.get('ramp_rate', 0.0),
                ))
                if b_cfg['type'] == 'load':
                    zone_load += abs(b['p'])
                elif b_cfg['type'] in ('generator', 'solar', 'wind', 'slack'):
                    zone_gen += b['p']
                # battery: not classified as load or gen

        # Internal lines (within zone)
        int_line_ids = set(self.internal_lines.get(agent_id, []))
        internal_lines = []
        for l in self.line_state:
            if l['id'] in int_line_ids:
                limit = self._get_line_capacity(l['id'])
                rho = abs(l['flow']) / limit if l['connected'] and limit > 0 else 0.0
                # Add noise to line readings
                noisy_rho = rho + self._rng.normal(0, self.LINE_NOISE_STD) if self._rng else rho
                noisy_rho = max(0.0, noisy_rho)
                internal_lines.append(LineStatus(
                    id=l['id'], connected=l['connected'],
                    flow=round(l['flow'], 4),
                    rho=round(noisy_rho, 4),
                ))

        # Boundary lines (connecting to other zones)
        bnd_line_ids = set(self.boundary_lines.get(agent_id, []))
        boundary_lines = []
        for l in self.line_state:
            if l['id'] in bnd_line_ids:
                limit = self._get_line_capacity(l['id'])
                rho = abs(l['flow']) / limit if l['connected'] and limit > 0 else 0.0
                noisy_rho = rho + self._rng.normal(0, self.LINE_NOISE_STD) if self._rng else rho
                noisy_rho = max(0.0, noisy_rho)
                boundary_lines.append(LineStatus(
                    id=l['id'], connected=l['connected'],
                    flow=round(l['flow'], 4),
                    rho=round(noisy_rho, 4),
                ))

        # Noisy frequency (POMDP — agents don't get perfect readings)
        true_freq = self._compute_frequency()
        noisy_freq = true_freq + (self._rng.normal(0, self.FREQ_NOISE_STD) if self._rng else 0.0)

        # Neighbor signals: average bus injection of other zones
        neighbor_signals = {}
        for other_id in range(self.num_agents):
            if other_id == agent_id:
                continue
            other_bus_ids = self.zone_bus_ids.get(other_id, [])
            if other_bus_ids:
                avg_inj = np.mean([
                    b['p'] for b in self.bus_state if b['id'] in other_bus_ids
                ])
                neighbor_signals[other_id] = round(float(avg_inj), 2)

        # Cooldowns for lines this agent can see
        visible_lines = int_line_ids | bnd_line_ids
        visible_cooldowns = {
            k: v for k, v in self.cooldowns.items() if k in visible_lines
        }

        zone_name = self.zone_names[agent_id] if agent_id < len(self.zone_names) else f"Zone_{agent_id}"

        return ZoneObservation(
            agent_id=agent_id,
            zone_name=zone_name,
            timestep=self.timestep,
            grid_frequency=round(noisy_freq, 4),
            local_buses=local_buses,
            boundary_lines=boundary_lines,
            internal_lines=internal_lines,
            neighbor_signals=neighbor_signals,
            cooldowns=visible_cooldowns,
            is_blackout=False,
            zone_load_mw=round(zone_load, 2),
            zone_gen_mw=round(zone_gen, 2),
        )

    # ======================================================================
    # Internal Methods (unchanged from original)
    # ======================================================================

    def _run_power_flow(self) -> None:
        """Build active line list, solve DC power flow, update line flows and slack injection."""
        active_lines = []
        for l_cfg in self.lines_config:
            l_dyn = self._find_line(l_cfg['id'])
            if l_dyn and l_dyn['connected']:
                active_lines.append({
                    'id': l_cfg['id'], 'from': l_cfg['from'], 'to': l_cfg['to'],
                    'susceptance': l_cfg['susceptance'], 'connected': True
                })

        self.solver.update_grid(active_lines)

        # Injection vector indexed by bus id — bus ids are used directly
        # as array indices, so they must be 0..num_buses-1.
        p_inj = np.zeros(self.num_buses)
        for b_dyn in self.bus_state:
            p_inj[b_dyn['id']] = b_dyn['p']

        theta, flows, slack_inj = self.solver.solve(p_inj)

        self.slack_injection = slack_inj
        slack_dyn = self._find_bus_state(self.slack_bus_id)
        if slack_dyn is not None:
            slack_dyn['p'] = slack_inj

        for l in self.line_state:
            if l['connected'] and l['id'] in flows:
                l['flow'] = flows[l['id']]
            elif not l['connected']:
                l['flow'] = 0.0

    def _compute_frequency(self) -> float:
        """Frequency proxy using droop model, calibrated to system size."""
        return self.NOMINAL_FREQ - self.droop_constant * self.slack_injection

    def _update_loads_and_renewables(self) -> None:
        """Update time-varying loads and renewable generation. Uses per-episode RNG."""
        for b_dyn in self.bus_state:
            b_cfg = self._find_bus_config(b_dyn['id'])
            if b_cfg is None:
                continue

            if b_cfg['type'] == 'load':
                # 24-step daily sinusoid, peaking mid-"day"; loads are
                # negative injections in the 0.8x–1.2x base range.
                daily_cycle = math.sin((self.timestep % 24 - 6) * math.pi / 12)
                b_dyn['p'] = -b_cfg['base_p'] * (0.8 + 0.4 * max(0, daily_cycle))

            elif b_cfg['type'] == 'solar':
                # Solar follows the positive half of the same daily cycle
                solar_cycle = max(0, math.sin((self.timestep % 24 - 6) * math.pi / 12))
                b_dyn['p'] = b_cfg['max_p'] * solar_cycle

            elif b_cfg['type'] == 'wind':
                # Wind does a bounded random walk in [0, max_p]
                wind_delta = self._rng.uniform(-5, 5)
                b_dyn['p'] = float(np.clip(b_dyn['p'] + wind_delta, 0, b_cfg['max_p']))

    def _get_obs(self) -> GridObservation:
        """Build observation from current state."""
        obs_lines = []
        for l in self.line_state:
            limit = self._get_line_capacity(l['id'])
            rho = abs(l['flow']) / limit if l['connected'] and limit > 0 else 0.0
            obs_lines.append(LineStatus(
                id=l['id'], connected=l['connected'], flow=round(l['flow'], 4), rho=round(rho, 4)
            ))

        obs_buses = []
        for b in self.bus_state:
            b_cfg = self._find_bus_config(b['id'])
            if b_cfg is None:
                continue
            obs_buses.append(BusState(
                id=b['id'], type=b_cfg['type'],
                p_injection=round(b['p'], 4),
                soc=round(b.get('soc', 0.0), 4),
                ramp_rate=b_cfg.get('ramp_rate', 0.0)
            ))

        freq = self._compute_frequency()

        return GridObservation(
            timestep=self.timestep,
            grid_frequency=round(freq, 4),
            buses=obs_buses,
            lines=obs_lines,
            cooldowns=self.cooldowns,
            is_blackout=getattr(self, '_is_blackout', False)
        )

    # ---------- Lookup Helpers (O(1) indexed + guarded fallbacks) ----------

    def _find_line(self, line_id: str):
        """Return the dynamic line-state dict for line_id, or None."""
        # Use index if available (built in reset), fall back to linear scan
        idx = getattr(self, '_line_state_by_id', None)
        if idx is not None:
            return idx.get(line_id)
        return next((l for l in self.line_state if l['id'] == line_id), None)

    def _find_bus_config(self, bus_id: int):
        """Return the static bus config dict for bus_id, or None."""
        return self._bus_cfg_by_id.get(bus_id)

    def _find_bus_state(self, bus_id: int):
        """Return the dynamic bus-state dict for bus_id, or None."""
        idx = getattr(self, '_bus_state_by_id', None)
        if idx is not None:
            return idx.get(bus_id)
        return next((b for b in self.bus_state if b['id'] == bus_id), None)

    def _get_line_capacity(self, line_id: str) -> float:
        """Return the configured capacity for line_id (defaults to 1.0 if unknown)."""
        cfg = self._line_cfg_by_id.get(line_id)
        return cfg['capacity'] if cfg else 1.0
|
src/grader.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import math
|
| 3 |
+
import numpy as np
|
| 4 |
+
from typing import Dict, Callable, List
|
| 5 |
+
from .environment import OpenGridEnv
|
| 6 |
+
from .models import GridAction, BusAdjustment, TopologyAction
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _random_thrash_policy(obs, rng: np.random.Generator) -> GridAction:
    """Deliberately bad policy: random topology switching. Used as reward floor.

    Toggles each line with probability ~0.3 (open if connected, close if
    not), keeping the grid unstable across all steps rather than just the
    first. Takes an explicit RNG instance (never global np.random) so the
    floor estimate is reproducible; the comprehension draws exactly one
    rng.random() per line, in line order, matching the original loop.
    """
    toggles = [
        TopologyAction(
            line_id=line.id,
            action="open" if line.connected else "close",
        )
        for line in obs.lines
        if rng.random() > 0.7
    ]
    return GridAction(topology_actions=toggles)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def compute_analytical_ceiling(max_steps: int, per_step_max: float = 1.2) -> float:
    """Compute the theoretical maximum reward for an episode.

    Perfect agent: survives every step (+1.0 survival) and achieves
    tight frequency control bonus (+0.2) every step, with zero overload
    and zero action cost.

        ceiling = max_steps * per_step_max = max_steps * 1.2 (default)

    Args:
        max_steps: Number of steps in the episode.
        per_step_max: Maximum achievable reward per step. Defaults to 1.2
            (1.0 survival + 0.2 frequency bonus); parameterized so tasks
            whose reward shape differs can reuse this helper without a
            hard-coded constant.

    NOTE: The +0.2 frequency bonus requires freq_dev < 0.1 Hz, which needs
    |P_slack| < 0.04 * S_total (from droop model). On high-renewable tasks
    (task_hard) where slack routinely absorbs >50 MW of imbalance, this band
    may be structurally inaccessible. The effective ceiling on such tasks is
    closer to max_steps * 1.0 = 50.0. Scores remain comparable across agents
    on the same task — the ceiling just compresses the achievable range.
    """
    return max_steps * per_step_max
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# Validator requires scores strictly in the open interval (0, 1).
# Using wide epsilon so that even aggressive rounding (e.g. round(x, 1))
# can never produce exactly 0.0 or 1.0.
_SCORE_EPSILON = 0.02
_SCORE_MIN = _SCORE_EPSILON        # 0.02 — lowest score _clamp_score will emit
_SCORE_MAX = 1.0 - _SCORE_EPSILON  # 0.98 — highest score _clamp_score will emit
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _safe_float(x: float) -> float:
|
| 54 |
+
"""Convert to plain Python float; replace NaN/Inf with midpoint."""
|
| 55 |
+
v = float(x)
|
| 56 |
+
if not math.isfinite(v):
|
| 57 |
+
return 0.5 # safe fallback inside (0, 1)
|
| 58 |
+
return v
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _clamp_score(score: float) -> float:
    """Clamp a score into the open interval (0, 1) and truncate to 4 decimals.

    Uses only Python-native min/max so the result is a plain float (no
    numpy-scalar serialisation quirks) that JSON-encodes as a normal number.
    Truncation — not rounding — guarantees downstream rounding (e.g.
    round(0.98500..., 4) == 0.985) can never land on the 0/1 boundary.
    """
    bounded = min(max(_safe_float(score), _SCORE_MIN), _SCORE_MAX)
    # Value is positive here, so floor() is truncation toward zero.
    truncated = math.floor(bounded * 10000) / 10000
    # Re-clamp in case truncation landed exactly on a boundary.
    return min(max(truncated, _SCORE_MIN), _SCORE_MAX)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def normalize_score(cumulative_reward: float, reward_floor: float, reward_ceiling: float,
                    n1_survival_rate: float = 1.0) -> float:
    """
    Shared normalization: maps raw cumulative reward to the open interval (0, 1).
    Used by both /grader endpoint and RobustnessGrader for consistency.

    - reward_floor: empirical worst-case (random thrashing policy, seeded RNG)
    - reward_ceiling: analytical upper bound (perfect survival + perfect frequency bonus)
    - n1_survival_rate: fraction of episodes without blackout (adds up to 10% bonus)

    Scores are clamped to [0.02, 0.98] so they are never exactly 0.0 or 1.0,
    and cannot round to those values, satisfying the OpenEnv Phase-2 validator.
    """
    floor = _safe_float(reward_floor)
    span = _safe_float(reward_ceiling) - floor
    if span < 1.0:
        # Guard against division by a near-zero (or inverted) range.
        span = 1.0

    base = (_safe_float(cumulative_reward) - floor) / span

    # N-1 bonus: up to 10% boost for surviving without blackout, scaled
    # into half the headroom below _SCORE_MAX so top performers still
    # differentiate.
    bonus = float(n1_survival_rate) * 0.1
    headroom = _SCORE_MAX - base
    bonus = min(bonus, headroom * 0.5) if headroom > 0 else 0.0

    return _clamp_score(base + bonus)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class RobustnessGrader:
    """
    Evaluates a policy's performance on an OpenGrid task.

    Scoring:
    - Floor: empirical estimate from adversarial random topology thrashing
      (seeded RNG for reproducibility, n_samples=10 for stability)
    - Ceiling: analytical upper bound = max_steps * 1.2
      (perfect survival + perfect frequency bonus every step)
    - Normalizes cumulative reward to 0.0–1.0
    - Adds N-1 survival bonus (max 10%)

    The heuristic baseline scores ~0.75–0.90, leaving headroom for
    agents that employ active topology management and predictive scheduling.
    """

    def __init__(self, config: Dict):
        # Environment config; 'seed' and 'max_steps' are read with defaults.
        self.config = config
        # Bounds are computed lazily on first use (see _estimate_bounds).
        self.reward_floor = None
        self.reward_ceiling = None

    def _estimate_bounds(self, n_samples: int = 10):
        """Estimate reward bounds.

        Floor: adversarial random thrashing policy (empirical, seeded).
        Ceiling: analytical upper bound (deterministic).

        n_samples=10 to reduce variance in the floor estimate.
        The floor uses mean - std to be conservatively low.
        Each episode gets its own thrash RNG derived from a master seed
        so that changing n_samples doesn't alter existing episodes.
        """
        # Fixed master seed: the sequence of integers() draws below must be
        # stable for floor reproducibility — do not reorder these calls.
        master_rng = np.random.default_rng(seed=12345)

        floors = []
        base_seed = self.config.get('seed', 42)

        for i in range(n_samples):
            # Per-episode thrash RNG — decoupled from other episodes
            thrash_rng = np.random.default_rng(seed=int(master_rng.integers(0, 2**31)))

            # Vary environment seed so floor reflects environment stochasticity
            config_with_seed = {**self.config, 'seed': base_seed + i}
            env = OpenGridEnv(config_with_seed)
            obs = env.reset()
            done = False
            ep_reward = 0
            while not done:
                action = _random_thrash_policy(obs, rng=thrash_rng)
                obs, reward, done, info = env.step(action)
                ep_reward += reward.value
            floors.append(ep_reward)

        # mean - std keeps the floor conservatively low so mediocre
        # policies don't clip against it.
        self.reward_floor = float(np.mean(floors) - np.std(floors))
        logger.debug("Floor estimate: mean=%.2f, std=%.2f, floor=%.2f",
                     np.mean(floors), np.std(floors), self.reward_floor)

        # Ceiling: analytical upper bound (not heuristic)
        max_steps = self.config.get('max_steps', 50)
        analytical_ceiling = compute_analytical_ceiling(max_steps)
        self.reward_ceiling = analytical_ceiling

        # Ensure minimum spread — expand floor downward, not ceiling upward
        if self.reward_ceiling - self.reward_floor < 10.0:
            self.reward_floor = self.reward_ceiling - max(10.0, analytical_ceiling * 0.2)
            logger.debug("Spread too small, adjusted floor to %.2f", self.reward_floor)

    def get_bounds(self) -> Dict[str, float]:
        """Return the reward floor and ceiling, computing if needed."""
        if self.reward_floor is None:
            self._estimate_bounds()
        return {"reward_floor": self.reward_floor, "reward_ceiling": self.reward_ceiling}

    def evaluate_policy(self, policy_fn: Callable, n_episodes: int = 10) -> Dict:
        """Run a policy for n_episodes and return normalized score.

        Each episode uses a different environment seed (offset by 1000 from
        floor estimation seeds) to measure policy robustness across diverse
        wind/load trajectories.
        """
        if self.reward_floor is None:
            self._estimate_bounds()

        base_seed = self.config.get('seed', 42)
        rewards = []
        n1_survivals = 0

        for ep in range(n_episodes):
            # Offset by 1000 to avoid overlap with floor estimation seeds
            config_with_seed = {**self.config, 'seed': base_seed + ep + 1000}
            env = OpenGridEnv(config_with_seed)
            obs = env.reset()
            done = False
            ep_reward = 0

            # NOTE(review): assumes every episode runs at least one step; if
            # the loop body never executed, 'info' below would be unbound —
            # confirm OpenGridEnv guarantees max_steps >= 1.
            while not done:
                action = policy_fn(obs)
                obs, reward, done, info = env.step(action)
                ep_reward += reward.value

            rewards.append(ep_reward)
            # Blackout status is read from the FINAL step's info only.
            if not info.is_blackout:
                n1_survivals += 1

        avg_reward = float(np.mean(rewards))
        n1_rate = n1_survivals / n_episodes
        logger.debug("Policy eval: avg=%.2f, n1_rate=%.2f, episodes=%d",
                     avg_reward, n1_rate, n_episodes)

        final_score = normalize_score(
            cumulative_reward=avg_reward,
            reward_floor=self.reward_floor,
            reward_ceiling=self.reward_ceiling,
            n1_survival_rate=n1_rate
        )

        return {
            "avg_raw_reward": round(avg_reward, 4),
            "n1_survival_rate": round(n1_rate, 4),
            "reward_floor": round(self.reward_floor, 4),
            "reward_ceiling": round(self.reward_ceiling, 4),
            "score": final_score
        }
|
src/models.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict, Literal, Optional
|
| 2 |
+
from pydantic import BaseModel, Field
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class TopologyAction(BaseModel):
    """A topology switching action on a transmission line."""
    line_id: str  # id of the line to switch (matches LineStatus.id)
    action: Literal["open", "close"]  # open = disconnect, close = reconnect
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class BusAdjustment(BaseModel):
    """A power injection adjustment on a bus."""
    bus_id: int  # target bus (matches BusState.id)
    delta: float  # MW change (positive = inject more)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class GridAction(BaseModel):
    """Agent action: adjust bus injections and/or switch line topology.

    Both lists may be empty (a no-op action). Defaults use
    Field(default_factory=list) rather than a bare mutable `[]`, for
    consistency with the other models in this module (pydantic copies
    plain-list defaults per-instance, but default_factory is the explicit,
    idiomatic form).
    """
    bus_adjustments: List[BusAdjustment] = Field(default_factory=list)
    topology_actions: List[TopologyAction] = Field(default_factory=list)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class LineStatus(BaseModel):
    """Current state of a transmission line."""
    id: str  # unique line identifier
    connected: bool  # False when the line is open (switched out)
    flow: float = 0.0  # power flow on the line (MW, signed)
    rho: float = Field(0.0, ge=0.0, description="Loading percentage (flow/capacity)")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class BusState(BaseModel):
    """Current state of a bus (generator, load, battery, or renewable)."""
    id: int  # unique bus identifier
    type: Literal["slack", "generator", "load", "battery", "solar", "wind"]
    p_injection: float  # net injection (MW); negative for loads
    soc: float = Field(0.0, ge=0.0, description="State of charge (MWh)")
    ramp_rate: float = 0.0  # max injection change per step — presumably MW/step; confirm in tasks config
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class GridObservation(BaseModel):
    """Full grid observation returned by reset()/step()/state()."""
    timestep: int  # current simulation step
    grid_frequency: float  # system frequency (Hz)
    buses: List[BusState]
    lines: List[LineStatus]
    # Presumably keyed by line/asset id with remaining cooldown steps as
    # values — confirm against the environment's cooldown bookkeeping.
    cooldowns: Dict[str, int]
    is_blackout: bool = False  # True once the episode has ended in blackout

    def __repr__(self) -> str:
        # Compact one-line summary (overrides pydantic's default field dump).
        return (
            f"GridObservation(t={self.timestep}, f={self.grid_frequency:.2f}, "
            f"buses={len(self.buses)}, lines={len(self.lines)}, "
            f"blackout={self.is_blackout})"
        )
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class GridReward(BaseModel):
    """Reward signal with component breakdown."""
    value: float  # total scalar reward for the step
    components: Dict[str, float]  # per-component contributions summing into value
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class GridInfo(BaseModel):
    """Episode info (metadata alongside reward)."""
    task_id: str  # identifier of the active task/difficulty
    is_blackout: bool  # True if the grid has blacked out this episode
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# ---------------------------------------------------------------------------
|
| 70 |
+
# Multi-Agent POMDP Models
|
| 71 |
+
# ---------------------------------------------------------------------------
|
| 72 |
+
|
| 73 |
+
class ZoneInfo(BaseModel):
    """Metadata about an agent's zone."""
    agent_id: int
    zone_name: str
    bus_ids: List[int]  # buses owned by this zone
    boundary_line_ids: List[str]  # lines crossing into other zones
    internal_line_ids: List[str]  # lines entirely within this zone
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class ZoneObservation(BaseModel):
    """Partial observation for one agent under POMDP.

    Each agent sees only:
    - Their local buses (within their zone)
    - Boundary lines (connecting to other zones)
    - Internal lines (within their zone)
    - A noisy estimate of global grid frequency
    - Limited communication signals from neighboring agents
    """
    agent_id: int
    zone_name: str
    timestep: int
    grid_frequency: float  # noisy — Gaussian noise added
    local_buses: List[BusState]
    boundary_lines: List[LineStatus]
    internal_lines: List[LineStatus]
    neighbor_signals: Dict[int, float] = Field(
        default_factory=dict,
        description="Limited info from other agents: {agent_id: their avg bus injection}"
    )
    cooldowns: Dict[str, int] = Field(default_factory=dict)  # asset id -> steps remaining
    is_blackout: bool = False
    zone_load_mw: float = 0.0  # aggregate load in this zone (MW)
    zone_gen_mw: float = 0.0  # aggregate generation in this zone (MW)

    def __repr__(self) -> str:
        # Compact one-line summary (overrides pydantic's default field dump).
        return (
            f"ZoneObservation(agent={self.agent_id}, zone={self.zone_name}, "
            f"t={self.timestep}, f={self.grid_frequency:.2f}, "
            f"buses={len(self.local_buses)}, blackout={self.is_blackout})"
        )
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
class SafetyReport(BaseModel):
    """Report from the safety layer about action corrections."""
    agent_id: int
    was_corrected: bool  # True if the safety layer altered the agent's action
    correction_reason: str = ""
    n1_violations_detected: int = 0
    proposed_topology_actions: int = 0
    blocked_topology_actions: int = 0
    original_total_delta_mw: float = 0.0  # sum of requested bus deltas (MW)
    corrected_total_delta_mw: float = 0.0  # sum after safety correction (MW)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
class OversightReport(BaseModel):
    """Report from the oversight agent about multi-agent coordination."""
    coordination_score: float = Field(
        1.0, description="1.0 = perfect cooperation, 0.0 = total conflict"
    )
    conflicting_actions_detected: int = 0
    selfish_actions_detected: int = 0
    coordination_penalties: Dict[int, float] = Field(default_factory=dict)  # agent_id -> penalty
    global_frequency_contribution: Dict[int, float] = Field(
        default_factory=dict,
        description="Each agent's net impact on frequency deviation"
    )
    notes: List[str] = Field(default_factory=list)  # human-readable rule-engine findings
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
class MultiAgentAction(BaseModel):
    """Request body for /step_multi: per-agent actions keyed by agent_id.

    NOTE: JSON object keys arrive as strings; pydantic coerces them to int
    for the Dict[int, ...] annotation.
    """
    agent_actions: Dict[int, GridAction] = Field(
        default_factory=dict,
        description="Actions for each agent, keyed by agent_id"
    )
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
class MultiAgentStepResult(BaseModel):
    """Result of a multi-agent step — per-agent observations, rewards, reports."""
    observations: Dict[int, ZoneObservation]  # keyed by agent_id
    rewards: Dict[int, GridReward]  # per-agent rewards, keyed by agent_id
    team_reward: float  # shared team-level reward for the step
    done: bool  # True when the episode has terminated
    safety_reports: Dict[int, SafetyReport] = Field(
        default_factory=dict,
        description="Per-agent safety reports, keyed by agent_id"
    )
    oversight_report: OversightReport
    info: GridInfo
|
src/oversight.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Oversight Agent — Multi-Agent Coordination Monitor
|
| 3 |
+
===================================================
|
| 4 |
+
A rule-based meta-agent that monitors coordination quality across zones.
|
| 5 |
+
|
| 6 |
+
Responsibilities:
|
| 7 |
+
1. Detect conflicting actions (agents pulling frequency opposite ways)
|
| 8 |
+
2. Detect selfish behavior (local improvement at global cost)
|
| 9 |
+
3. Assign coordination penalties to agents
|
| 10 |
+
4. Track safety layer intervention frequency
|
| 11 |
+
|
| 12 |
+
This is NOT a trained agent — it's a deterministic rule engine that
|
| 13 |
+
provides additional reward signal to guide multi-agent learning.
|
| 14 |
+
|
| 15 |
+
References:
|
| 16 |
+
- Symphony: Multi-Agent Intelligence in a Collective Fabric (Gradient, 2025)
|
| 17 |
+
- Massgen: When Multiple LLMs Think Together (Gradient, 2025)
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import logging
|
| 21 |
+
import math
|
| 22 |
+
from typing import Dict, List
|
| 23 |
+
from .models import GridAction, SafetyReport, OversightReport
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class OversightAgent:
    """Rule-based oversight agent for multi-agent coordination.

    Sits above zone agents and evaluates whether their combined actions
    are globally beneficial or harmful. Produces an OversightReport
    with coordination scores and penalties.
    """

    def __init__(self, config: Dict):
        self.config = config
        # Zone metadata; read from config but not referenced in evaluate().
        self.zone_assignments = config.get('zone_assignments', {})
        self.num_agents = config.get('num_agents', 1)
        # Cumulative safety-correction count per agent across the episode;
        # cleared by reset().
        self.intervention_history: Dict[int, int] = {
            i: 0 for i in range(self.num_agents)
        }

    def evaluate(
        self,
        agent_actions: Dict[int, GridAction],
        safety_reports: Dict[int, SafetyReport],
        pre_frequency: float,
        post_frequency: float,
        pre_bus_state: List[Dict],
        post_bus_state: List[Dict],
    ) -> OversightReport:
        """Evaluate multi-agent coordination quality.

        Args:
            agent_actions: {agent_id: GridAction} — proposed actions
            safety_reports: {agent_id: SafetyReport} — per-agent safety results
            pre_frequency: Grid frequency before this step
            post_frequency: Grid frequency after this step
            pre_bus_state: Bus states before actions
            post_bus_state: Bus states after actions

        Returns:
            OversightReport with scores, penalties, and notes

        NOTE(review): pre_bus_state and post_bus_state are accepted but not
        used anywhere in this method — kept for interface stability or
        future rules; confirm before removing.
        """
        notes = []
        penalties: Dict[int, float] = {i: 0.0 for i in range(self.num_agents)}
        conflicts = 0
        selfish_count = 0

        # --- 1. Track safety interventions ---
        for agent_id, report in safety_reports.items():
            # Validate agent_id is within expected range
            if agent_id not in self.intervention_history:
                notes.append(f"WARNING: unknown agent_id {agent_id} in safety report")
                continue
            if report.was_corrected:
                self.intervention_history[agent_id] += 1
                # Penalty scales with repeated violations (capped at 5x)
                repeat_count = self.intervention_history[agent_id]
                penalties[agent_id] += 0.1 * min(repeat_count, 5)
                notes.append(
                    f"Agent {agent_id}: safety correction #{repeat_count}"
                )

        # --- 2. Detect conflicting frequency actions ---
        # If agents are pushing frequency in opposite directions, that's waste
        net_deltas = {}
        for agent_id, action in agent_actions.items():
            total_delta = sum(a.delta for a in action.bus_adjustments)
            n_topo = len(action.topology_actions)
            if n_topo > 0:
                notes.append(
                    f"Agent {agent_id}: {n_topo} topology action(s) "
                    f"not included in conflict analysis"
                )
            net_deltas[agent_id] = total_delta

        if len(net_deltas) >= 2:
            deltas = list(net_deltas.values())
            # Check if some agents inject and others withdraw significantly
            # (2.0 MW dead-band filters out trim adjustments)
            injectors = [d for d in deltas if d > 2.0]
            withdrawers = [d for d in deltas if d < -2.0]
            if injectors and withdrawers:
                conflicts += 1
                notes.append(
                    "Conflicting actions: some agents inject while others withdraw"
                )
                # Penalize the agent pushing AGAINST the needed direction
                freq_error = 50.0 - pre_frequency

                if abs(freq_error) > 0.1:
                    # Clear direction needed — penalize the opposing side
                    for agent_id, delta in net_deltas.items():
                        # If freq < 50 (need more injection) but agent withdraws
                        if freq_error > 0.1 and delta < -2.0:
                            penalties[agent_id] += 0.2
                            selfish_count += 1
                            notes.append(
                                f"Agent {agent_id}: withdrew {delta:.1f} MW "
                                f"when grid needed injection"
                            )
                        # If freq > 50 (need less injection) but agent injects
                        elif freq_error < -0.1 and delta > 2.0:
                            penalties[agent_id] += 0.2
                            selfish_count += 1
                            notes.append(
                                f"Agent {agent_id}: injected {delta:.1f} MW "
                                f"when grid had excess"
                            )
                else:
                    # Near-nominal: penalize all significant participants equally
                    for agent_id, delta in net_deltas.items():
                        if abs(delta) > 2.0:
                            penalties[agent_id] += 0.1
                            notes.append(
                                f"Agent {agent_id}: conflicting injection "
                                f"({delta:+.1f} MW) with no clear grid need"
                            )

        # --- 3. Evaluate frequency impact per agent ---
        freq_contribution: Dict[int, float] = {}
        freq_dev_before = abs(pre_frequency - 50.0)
        freq_dev_after = abs(post_frequency - 50.0)
        freq_improved = freq_dev_after < freq_dev_before

        for agent_id in range(self.num_agents):
            # Net MW delta (not frequency impact — would need droop constant)
            total_delta = net_deltas.get(agent_id, 0.0)
            freq_contribution[agent_id] = round(total_delta, 4)

        # --- 4. Compute coordination score ---
        # Sub-linear scaling: diminishing penalty per additional incident
        # prevents score from collapsing to 0.0 for mildly bad teams
        safety_corrections = sum(
            1 for r in safety_reports.values() if r.was_corrected
        )

        conflict_penalty = 1.0 - math.exp(-conflicts * 0.3)
        selfish_penalty = 1.0 - math.exp(-selfish_count * 0.2)
        safety_penalty = 1.0 - math.exp(-safety_corrections * 0.2)

        # Weighted mix: conflicts hurt most, then selfishness/safety equally.
        base_score = (1.0
                      - 0.4 * conflict_penalty
                      - 0.3 * selfish_penalty
                      - 0.3 * safety_penalty)

        # Frequency improvement bonus / degradation penalty
        if freq_improved:
            base_score += 0.1
        else:
            degradation = freq_dev_after - freq_dev_before
            base_score -= min(degradation * 0.5, 0.2)

        coordination_score = max(0.0, min(1.0, base_score))

        return OversightReport(
            coordination_score=round(coordination_score, 4),
            conflicting_actions_detected=conflicts,
            selfish_actions_detected=selfish_count,
            coordination_penalties=penalties,
            global_frequency_contribution=freq_contribution,
            notes=notes,
        )

    def reset(self):
        """Reset intervention history for a new episode."""
        self.intervention_history = {
            i: 0 for i in range(self.num_agents)
        }
|
src/physics.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
DC Power Flow Solver
|
| 3 |
+
====================
|
| 4 |
+
Implements the standard DC approximation: B * θ = P
|
| 5 |
+
|
| 6 |
+
Assumptions:
|
| 7 |
+
- Flat voltage profile (|V| ≈ 1.0 p.u.)
|
| 8 |
+
- Small angle differences (sin(θ) ≈ θ)
|
| 9 |
+
- Negligible resistance (R ≈ 0, only susceptance used)
|
| 10 |
+
|
| 11 |
+
Flow sign convention:
|
| 12 |
+
flow = b * (θ_from - θ_to)
|
| 13 |
+
Positive flow = power flowing from 'from' bus to 'to' bus.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import logging
|
| 17 |
+
import warnings
|
| 18 |
+
import numpy as np
|
| 19 |
+
from typing import List, Dict, Tuple
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class IslandedException(Exception):
    """Signals that the grid graph is not fully connected (islanded).

    Raise site is in the solver's connectivity check — see DCSolver.
    """
    pass
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class DCSolver:
|
| 29 |
+
"""DC power flow solver with graph-based islanding detection.
|
| 30 |
+
|
| 31 |
+
The slack bus absorbs any power imbalance and has its voltage angle
|
| 32 |
+
fixed to 0 (reference). By default this is bus 0, but can be
|
| 33 |
+
configured via the slack_bus parameter.
|
| 34 |
+
"""
|
| 35 |
+
|
| 36 |
+
def __init__(self, num_buses: int, slack_bus: int = 0):
|
| 37 |
+
self.num_buses = num_buses
|
| 38 |
+
self.slack_bus = slack_bus
|
| 39 |
+
self.B = np.zeros((num_buses, num_buses))
|
| 40 |
+
self.line_map = {}
|
| 41 |
+
self._grid_loaded = False
|
| 42 |
+
|
| 43 |
+
def update_grid(self, lines: List[Dict]):
|
| 44 |
+
"""Rebuild the B matrix and check connectivity.
|
| 45 |
+
|
| 46 |
+
Skips zero-susceptance lines (no electrical contribution).
|
| 47 |
+
Validates bus indices to prevent silent corruption.
|
| 48 |
+
"""
|
| 49 |
+
self.B = np.zeros((self.num_buses, self.num_buses))
|
| 50 |
+
self.line_map = {}
|
| 51 |
+
|
| 52 |
+
# Union-Find for O(n) connectivity check (replaces NetworkX)
|
| 53 |
+
parent = list(range(self.num_buses))
|
| 54 |
+
rank = [0] * self.num_buses
|
| 55 |
+
|
| 56 |
+
def find(x):
|
| 57 |
+
while parent[x] != x:
|
| 58 |
+
parent[x] = parent[parent[x]] # path compression
|
| 59 |
+
x = parent[x]
|
| 60 |
+
return x
|
| 61 |
+
|
| 62 |
+
def union(x, y):
|
| 63 |
+
rx, ry = find(x), find(y)
|
| 64 |
+
if rx == ry:
|
| 65 |
+
return
|
| 66 |
+
if rank[rx] < rank[ry]:
|
| 67 |
+
rx, ry = ry, rx
|
| 68 |
+
parent[ry] = rx
|
| 69 |
+
if rank[rx] == rank[ry]:
|
| 70 |
+
rank[rx] += 1
|
| 71 |
+
|
| 72 |
+
for line in lines:
|
| 73 |
+
if line['connected']:
|
| 74 |
+
i, j = line['from'], line['to']
|
| 75 |
+
b = line['susceptance']
|
| 76 |
+
|
| 77 |
+
# Validate bus indices
|
| 78 |
+
if not (0 <= i < self.num_buses and 0 <= j < self.num_buses):
|
| 79 |
+
raise ValueError(
|
| 80 |
+
f"Line {line['id']}: bus indices ({i}, {j}) out of range "
|
| 81 |
+
f"for {self.num_buses} buses"
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
# Skip zero-susceptance lines (no electrical contribution)
|
| 85 |
+
if abs(b) < 1e-12:
|
| 86 |
+
continue
|
| 87 |
+
|
| 88 |
+
self.B[i, j] -= b
|
| 89 |
+
self.B[j, i] -= b
|
| 90 |
+
self.B[i, i] += b
|
| 91 |
+
self.B[j, j] += b
|
| 92 |
+
|
| 93 |
+
self.line_map[line['id']] = (i, j, b)
|
| 94 |
+
union(i, j)
|
| 95 |
+
|
| 96 |
+
# Connectivity check via union-find
|
| 97 |
+
root = find(0)
|
| 98 |
+
if not all(find(i) == root for i in range(self.num_buses)):
|
| 99 |
+
# Build component info for diagnostics
|
| 100 |
+
components = {}
|
| 101 |
+
for i in range(self.num_buses):
|
| 102 |
+
r = find(i)
|
| 103 |
+
components.setdefault(r, []).append(i)
|
| 104 |
+
comp_sizes = [len(c) for c in components.values()]
|
| 105 |
+
raise IslandedException(
|
| 106 |
+
f"Grid is islanded: {len(components)} components, "
|
| 107 |
+
f"sizes={comp_sizes}"
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
self._grid_loaded = True
|
| 111 |
+
|
| 112 |
+
def solve(self, p_inj: np.ndarray) -> Tuple[np.ndarray, Dict[str, float], float]:
|
| 113 |
+
"""Solve DC power flow: B_red * θ_red = P_red.
|
| 114 |
+
|
| 115 |
+
Args:
|
| 116 |
+
p_inj: Real power injection at each bus (MW). Shape must be (num_buses,).
|
| 117 |
+
|
| 118 |
+
Returns:
|
| 119 |
+
(theta, line_flows, slack_injection) tuple.
|
| 120 |
+
theta: voltage angles (radians). Slack bus angle = 0.
|
| 121 |
+
line_flows: {line_id: flow_MW}. Positive = from→to direction.
|
| 122 |
+
slack_injection: MW absorbed/injected by the slack bus.
|
| 123 |
+
"""
|
| 124 |
+
if not self._grid_loaded:
|
| 125 |
+
raise RuntimeError("DCSolver.solve() called before update_grid()")
|
| 126 |
+
|
| 127 |
+
# Validate input
|
| 128 |
+
p_inj = np.asarray(p_inj).ravel()
|
| 129 |
+
if len(p_inj) != self.num_buses:
|
| 130 |
+
raise ValueError(
|
| 131 |
+
f"p_inj length {len(p_inj)} != num_buses {self.num_buses}"
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
# Remove slack bus row/column
|
| 135 |
+
mask = np.arange(self.num_buses) != self.slack_bus
|
| 136 |
+
B_red = self.B[np.ix_(mask, mask)]
|
| 137 |
+
p_red = p_inj[mask]
|
| 138 |
+
|
| 139 |
+
try:
|
| 140 |
+
theta_red = np.linalg.solve(B_red, p_red)
|
| 141 |
+
except np.linalg.LinAlgError:
|
| 142 |
+
raise IslandedException("Grid is islanded (singular B matrix)")
|
| 143 |
+
|
| 144 |
+
# Check conditioning
|
| 145 |
+
cond = np.linalg.cond(B_red)
|
| 146 |
+
if cond > 1e12:
|
| 147 |
+
warnings.warn(
|
| 148 |
+
f"DCSolver: B_red is ill-conditioned (cond={cond:.2e}). "
|
| 149 |
+
f"Results may be numerically unreliable.",
|
| 150 |
+
RuntimeWarning,
|
| 151 |
+
stacklevel=2,
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
# Insert slack bus angle (= 0)
|
| 155 |
+
theta = np.zeros(self.num_buses)
|
| 156 |
+
theta[mask] = theta_red
|
| 157 |
+
|
| 158 |
+
# Compute line flows
|
| 159 |
+
flows = {}
|
| 160 |
+
for line_id, (i, j, b) in self.line_map.items():
|
| 161 |
+
flows[line_id] = (theta[i] - theta[j]) * b
|
| 162 |
+
|
| 163 |
+
# Slack injection from power balance (more robust than summing flows)
|
| 164 |
+
slack_injection = -float(p_inj[mask].sum())
|
| 165 |
+
|
| 166 |
+
return theta, flows, slack_injection
|
| 167 |
+
|
| 168 |
+
def __repr__(self):
|
| 169 |
+
return (
|
| 170 |
+
f"DCSolver(num_buses={self.num_buses}, slack={self.slack_bus}, "
|
| 171 |
+
f"lines={len(self.line_map)}, loaded={self._grid_loaded})"
|
| 172 |
+
)
|
src/safety.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Safety Layer — Hard Constraint Filter for OpenGrid
|
| 3 |
+
===================================================
|
| 4 |
+
Validates agent actions BEFORE they are applied to the environment.
|
| 5 |
+
If constraints are violated, actions are projected to the nearest safe alternative.
|
| 6 |
+
|
| 7 |
+
This is the core safety innovation: constraint violations should NEVER
|
| 8 |
+
reach the physics engine. The safety layer catches them first.
|
| 9 |
+
|
| 10 |
+
Checks:
|
| 11 |
+
1. Anti-Islanding: topology actions that would disconnect the grid are blocked
|
| 12 |
+
2. N-1 Security: for each critical line, simulate failure → check grid survives
|
| 13 |
+
3. Generation Limits: bus adjustments respect ramp rates and capacity
|
| 14 |
+
4. Zone Boundary: agents can only adjust buses in their assigned zone
|
| 15 |
+
|
| 16 |
+
References:
|
| 17 |
+
- KPTCL N-1 security criterion (Indian Grid Code, IEGC)
|
| 18 |
+
- Control Barrier Functions for safe RL (Ames et al., 2019)
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import logging
|
| 22 |
+
import networkx as nx
|
| 23 |
+
import numpy as np
|
| 24 |
+
from typing import List, Dict, Tuple
|
| 25 |
+
from .models import GridAction, BusAdjustment, TopologyAction, SafetyReport
|
| 26 |
+
|
| 27 |
+
logger = logging.getLogger(__name__)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class SafetyLayer:
    """Hard constraint filter that validates and corrects agent actions.

    The safety layer sits between agents and the environment:
        Agent proposes action → SafetyLayer validates → corrected action → Environment

    If an action would cause a constraint violation, it is PROJECTED to the
    nearest safe alternative (not just rejected). This preserves the agent's
    intent while enforcing safety, and provides a richer training signal
    than binary accept/reject.

    NOTE: connectivity checks use nx.MultiGraph keyed by line id so that
    parallel lines between the same pair of buses are tracked individually
    (a simple nx.Graph would collapse them, falsely reporting islanding
    when one of two parallel lines is opened).
    """

    def __init__(self, config: Dict):
        """Args:
            config: Grid config with 'num_buses', 'lines', 'buses', and
                optionally 'zone_assignments' ({bus_id: agent_id}). Zone
                enforcement is active only when zone_assignments is
                non-empty (multi-agent mode).
        """
        self.config = config
        self.num_buses = config['num_buses']
        self.lines_config = config['lines']
        self.buses_config = config['buses']
        self.zone_assignments = config.get('zone_assignments', {})
        self.zone_enforcement = bool(self.zone_assignments)

        # Build config indexes for O(1) lookups
        self._bus_cfg_by_id = {b['id']: b for b in self.buses_config}
        self._line_cfg_by_id = {l['id']: l for l in self.lines_config}

    def validate_and_correct(
        self,
        agent_id: int,
        proposed_action: GridAction,
        current_line_state: List[Dict],
        current_bus_state: List[Dict],
        cooldowns: Dict[str, int],
    ) -> Tuple[GridAction, SafetyReport]:
        """Full validation pipeline for one agent's proposed action.

        Args:
            agent_id: Zone owner id of the acting agent.
            proposed_action: The raw action proposed by the agent.
            current_line_state: Dynamic line dicts ('id', 'connected').
            current_bus_state: Dynamic bus dicts ('id', 'p', 'soc').
            cooldowns: {line_id: remaining cooldown steps}.

        Returns:
            corrected_action: Safe version of the proposed action
            report: Details of what was checked and corrected
        """
        corrections = []
        n1_violations = 0

        # Track original action stats for the report delta comparison.
        original_delta = sum(abs(a.delta) for a in proposed_action.bus_adjustments)
        proposed_topo_count = len(proposed_action.topology_actions)
        blocked_topo_count = 0

        # Build bus state index for O(1) lookups
        bus_dyn_by_id = {b['id']: b for b in current_bus_state}

        # --- 1. Zone boundary enforcement ---
        safe_bus_adj = []
        for adj in proposed_action.bus_adjustments:
            bus_zone = self.zone_assignments.get(adj.bus_id, -1)
            if not self.zone_enforcement or bus_zone == agent_id:
                # Agent owns this bus, or single-agent mode
                safe_bus_adj.append(adj)
            else:
                corrections.append(
                    f"Blocked bus {adj.bus_id} adjustment: "
                    f"belongs to zone {bus_zone}, not agent {agent_id}"
                )

        # --- 2. Generation limit enforcement ---
        # Aggregate adjustments per bus to prevent double-spending
        bus_deltas: Dict[int, float] = {}
        for adj in safe_bus_adj:
            bus_deltas[adj.bus_id] = bus_deltas.get(adj.bus_id, 0.0) + adj.delta

        clamped_bus_adj = []
        for bus_id, total_delta in bus_deltas.items():
            bus_cfg = self._bus_cfg_by_id.get(bus_id)
            bus_dyn = bus_dyn_by_id.get(bus_id)
            if bus_cfg is None or bus_dyn is None:
                corrections.append(f"Blocked bus {bus_id}: not found")
                continue

            delta = total_delta
            bus_type = bus_cfg['type']

            # Loads and renewables can't be directly adjusted
            if bus_type in ['load', 'solar', 'wind']:
                corrections.append(
                    f"Blocked bus {bus_id}: type '{bus_type}' is not controllable"
                )
                continue

            # Enforce ramp rate
            max_ramp = bus_cfg.get('ramp_rate', 20.0)
            if abs(delta) > max_ramp:
                delta = np.clip(delta, -max_ramp, max_ramp)
                corrections.append(
                    f"Clamped bus {bus_id} delta to ramp rate ±{max_ramp}"
                )

            # Enforce battery SoC limits (delta > 0 == discharge)
            if bus_type == 'battery':
                soc = bus_dyn.get('soc', 0.0)
                capacity = bus_cfg.get('capacity', 50.0)
                if delta > 0 and delta > soc:
                    delta = soc
                    corrections.append(
                        f"Clamped bus {bus_id} discharge to SoC={soc:.1f}"
                    )
                elif delta < 0 and abs(delta) > (capacity - soc):
                    delta = -(capacity - soc)
                    corrections.append(
                        f"Clamped bus {bus_id} charge to remaining capacity"
                    )

            # Enforce generator limits
            # NOTE: This is a best-effort projection based on pre-step state.
            # If multiple agents adjust the same bus via different zones,
            # the environment provides a secondary clamp.
            if bus_type in ['slack', 'generator']:
                current_p = bus_dyn.get('p', 0.0)
                new_p = current_p + delta
                min_p = bus_cfg.get('min_p', -50)
                max_p = bus_cfg.get('max_p', 100)
                if new_p < min_p or new_p > max_p:
                    new_p = np.clip(new_p, min_p, max_p)
                    delta = new_p - current_p
                    corrections.append(
                        f"Clamped bus {bus_id} to generation limits "
                        f"[{min_p}, {max_p}]"
                    )

            clamped_bus_adj.append(BusAdjustment(bus_id=bus_id, delta=delta))

        # --- 3. Topology safety (anti-islanding + N-1) ---
        # Build base graph once for all topology checks
        base_graph = self._build_connectivity_graph(current_line_state)

        safe_topo = []
        approved_opens: set = set()  # Track approved opens for cumulative check
        for t_act in proposed_action.topology_actions:
            line_id = t_act.line_id

            # Check cooldown
            if cooldowns.get(line_id, 0) > 0:
                corrections.append(
                    f"Blocked {line_id} switch: cooldown active "
                    f"({cooldowns[line_id]} steps)"
                )
                blocked_topo_count += 1
                continue

            # Check if opening this line would island the grid
            # (cumulative: checks against already-approved opens)
            if t_act.action == "open":
                if self._would_island(
                    line_id, base_graph, additional_opens=approved_opens
                ):
                    corrections.append(
                        f"Blocked opening {line_id}: would island the grid"
                    )
                    blocked_topo_count += 1
                    n1_violations += 1
                    continue
                approved_opens.add(line_id)

            safe_topo.append(t_act)

        # --- 4. N-1 check on final combined action ---
        if safe_topo:
            n1_fails = self._check_n1_post_action(safe_topo, current_line_state)
            if n1_fails > 0:
                n1_violations += n1_fails
                corrections.append(
                    f"N-1 warning: {n1_fails} lines would leave grid "
                    f"vulnerable after action"
                )

        corrected_action = GridAction(
            bus_adjustments=clamped_bus_adj,
            topology_actions=safe_topo
        )

        corrected_delta = sum(abs(a.delta) for a in clamped_bus_adj)

        was_corrected = len(corrections) > 0
        report = SafetyReport(
            agent_id=agent_id,
            was_corrected=was_corrected,
            correction_reason="; ".join(corrections) if corrections else "",
            n1_violations_detected=n1_violations,
            proposed_topology_actions=proposed_topo_count,
            blocked_topology_actions=blocked_topo_count,
            original_total_delta_mw=round(original_delta, 4),
            corrected_total_delta_mw=round(corrected_delta, 4),
        )

        return corrected_action, report

    def _build_connectivity_graph(
        self, current_line_state: List[Dict]
    ) -> nx.MultiGraph:
        """Build the connectivity graph from current line state (once).

        Edges are keyed by line id so parallel lines between the same bus
        pair remain distinct and removable individually.
        """
        G = nx.MultiGraph()
        G.add_nodes_from(range(self.num_buses))

        line_dyn_by_id = {l['id']: l for l in current_line_state}
        for l_cfg in self.lines_config:
            l_dyn = line_dyn_by_id.get(l_cfg['id'])
            if l_dyn is not None and l_dyn.get('connected', True):
                G.add_edge(l_cfg['from'], l_cfg['to'], key=l_cfg['id'])

        return G

    def _would_island(
        self,
        line_id: str,
        base_graph: nx.MultiGraph,
        additional_opens: set = None,
    ) -> bool:
        """Check if opening a line would disconnect the grid.

        Takes cumulative approved opens into account so that
        multiple simultaneous opens are correctly checked.
        """
        additional_opens = additional_opens or set()

        # Unknown lines can't affect topology
        line_cfg = self._line_cfg_by_id.get(line_id)
        if line_cfg is None:
            return False

        # Build a test graph with all proposed removals
        G = base_graph.copy()
        # Remove previously approved opens (by line-id key, so only the
        # specific line is removed even when a parallel line exists)
        for open_id in additional_opens:
            open_cfg = self._line_cfg_by_id.get(open_id)
            if open_cfg and G.has_edge(open_cfg['from'], open_cfg['to'], open_id):
                G.remove_edge(open_cfg['from'], open_cfg['to'], key=open_id)

        # Remove the line under test
        if G.has_edge(line_cfg['from'], line_cfg['to'], line_id):
            G.remove_edge(line_cfg['from'], line_cfg['to'], key=line_id)

        return not nx.is_connected(G)

    def _check_n1_post_action(
        self,
        topo_actions: List[TopologyAction],
        current_line_state: List[Dict],
    ) -> int:
        """Check N-1 security after applying proposed topology actions.

        For each remaining connected line, simulate its loss and check
        connectivity. Uses keyed edge removal/restoration instead of
        rebuilding the full graph for each contingency, and counts each
        line independently (parallel lines are separate contingencies).

        Returns the number of single-line contingencies that would island.
        """
        # Build the post-action line state
        post_state = {}
        for l_dyn in current_line_state:
            post_state[l_dyn['id']] = l_dyn.get('connected', True)
        for t_act in topo_actions:
            post_state[t_act.line_id] = (t_act.action == "close")

        # Build post-action graph once
        G = nx.MultiGraph()
        G.add_nodes_from(range(self.num_buses))

        connected_lines = []
        for l_cfg in self.lines_config:
            if post_state.get(l_cfg['id'], True):
                G.add_edge(l_cfg['from'], l_cfg['to'], key=l_cfg['id'])
                connected_lines.append(l_cfg)

        # Test each contingency via keyed edge removal/restoration
        n1_failures = 0
        for l_cfg in connected_lines:
            u, v = l_cfg['from'], l_cfg['to']
            G.remove_edge(u, v, key=l_cfg['id'])
            if not nx.is_connected(G):
                n1_failures += 1
            G.add_edge(u, v, key=l_cfg['id'])  # restore

        return n1_failures

    def reset(self):
        """Reset any per-episode state (future-proofing)."""
        pass
|
src/tasks.py
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grid Generator & Task Definitions
|
| 3 |
+
===================================
|
| 4 |
+
Generates reproducible power grid configurations for OpenGrid RL tasks.
|
| 5 |
+
|
| 6 |
+
Procedural grids use Watts-Strogatz small-world topology with
|
| 7 |
+
configurable difficulty (bus count, renewable penetration).
|
| 8 |
+
|
| 9 |
+
The Karnataka task is a hand-crafted 15-bus grid based on the
|
| 10 |
+
actual KPTCL transmission map.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import copy
|
| 14 |
+
import networkx as nx
|
| 15 |
+
import numpy as np
|
| 16 |
+
from typing import Dict, List, Tuple
|
| 17 |
+
|
| 18 |
+
__all__ = ['generate_procedural_grid', 'generate_karnataka_task', 'TASKS', 'get_task']
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# KPTCL-inspired zone names
|
| 22 |
+
def _get_zone_names(num_agents: int) -> List[str]:
|
| 23 |
+
"""Get human-readable zone names for a given agent count."""
|
| 24 |
+
base_names = [
|
| 25 |
+
"Bengaluru_Region", "Mysuru_Region", "Kalburagi_Region",
|
| 26 |
+
"Belagavi_Region", "Mangaluru_Region",
|
| 27 |
+
]
|
| 28 |
+
if num_agents <= len(base_names):
|
| 29 |
+
return base_names[:num_agents]
|
| 30 |
+
return [f"Zone_{i}" for i in range(num_agents)]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _partition_into_zones(G: nx.Graph, num_agents: int) -> Dict[int, int]:
    """Partition graph nodes into balanced, connected zones.

    Returns mapping of {bus_id: agent_id}.
    Guarantees: every bus is assigned, each zone has at least 1 node,
    and zones are roughly balanced in size.

    NOTE: Uses greedy modularity which is deterministic for a given graph
    structure but not guaranteed across NetworkX versions.
    """
    nodes = sorted(G.nodes())
    n = len(nodes)

    if n <= num_agents:
        # Trivial case: 1 bus per agent
        return {node: i for i, node in enumerate(nodes)}

    try:
        communities = nx.community.greedy_modularity_communities(G, cutoff=num_agents)
        # Largest community first so merges/splits below behave predictably.
        communities = [set(c) for c in sorted(communities, key=len, reverse=True)]
    except Exception:
        # Fallback: round-robin assignment by node index
        communities = [set() for _ in range(num_agents)]
        for i, node in enumerate(nodes):
            communities[i % num_agents].add(node)

    # If we got more communities than agents, merge smallest into largest
    while len(communities) > num_agents:
        smallest = communities.pop()
        communities[0] = communities[0] | smallest

    # If we got fewer, split the largest using topology-aware bisection
    while len(communities) < num_agents:
        largest = max(communities, key=len)
        communities.remove(largest)

        # Attempt topology-aware split
        subG = G.subgraph(largest).copy()
        split_done = False
        if nx.is_connected(subG) and len(largest) >= 2:
            # Find edge whose removal creates the most balanced partition.
            # Iterate a snapshot of the edges: the loop body removes and
            # re-adds edges, and mutating the live edge view while
            # iterating it can raise RuntimeError (dict changed size
            # during iteration).
            best_edge, best_balance = None, float('inf')
            target = len(largest) / 2
            for u, v in list(subG.edges()):
                subG.remove_edge(u, v)
                components = list(nx.connected_components(subG))
                if len(components) == 2:
                    balance = abs(len(components[0]) - target) + abs(len(components[1]) - target)
                    if balance < best_balance:
                        best_edge = (u, v)
                        best_balance = balance
                subG.add_edge(u, v)
            if best_edge:
                subG.remove_edge(*best_edge)
                parts = list(nx.connected_components(subG))
                communities.extend(parts)
                split_done = True

        if not split_done:
            # Fallback: arbitrary split by sorted node order
            largest_list = sorted(largest)
            half = len(largest) // 2
            communities.append(set(largest_list[:half]))
            communities.append(set(largest_list[half:]))

    # Ensure no empty zones
    for i, comm in enumerate(communities):
        if len(comm) == 0:
            # Steal a node from the largest community
            largest = max(communities, key=len)
            stolen = next(iter(largest))
            largest.remove(stolen)
            communities[i] = {stolen}

    zone_map = {}
    for agent_id, comm in enumerate(communities):
        for node in comm:
            zone_map[node] = agent_id

    return zone_map
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def _classify_lines(
|
| 116 |
+
lines_config: List[Dict], zone_assignments: Dict[int, int]
|
| 117 |
+
) -> Tuple[Dict[int, List[str]], Dict[int, List[str]]]:
|
| 118 |
+
"""Classify lines as internal (both endpoints in same zone) or boundary.
|
| 119 |
+
|
| 120 |
+
Returns:
|
| 121 |
+
internal_lines: {agent_id: [line_ids within this zone]}
|
| 122 |
+
boundary_lines: {agent_id: [line_ids on this zone's boundary]}
|
| 123 |
+
"""
|
| 124 |
+
agents = set(zone_assignments.values())
|
| 125 |
+
internal = {a: [] for a in agents}
|
| 126 |
+
boundary = {a: [] for a in agents}
|
| 127 |
+
|
| 128 |
+
for line in lines_config:
|
| 129 |
+
from_zone = zone_assignments.get(line['from'])
|
| 130 |
+
to_zone = zone_assignments.get(line['to'])
|
| 131 |
+
|
| 132 |
+
# Skip lines with unassigned bus endpoints
|
| 133 |
+
if from_zone is None or to_zone is None:
|
| 134 |
+
continue
|
| 135 |
+
|
| 136 |
+
if from_zone == to_zone:
|
| 137 |
+
internal[from_zone].append(line['id'])
|
| 138 |
+
else:
|
| 139 |
+
boundary[from_zone].append(line['id'])
|
| 140 |
+
boundary[to_zone].append(line['id'])
|
| 141 |
+
|
| 142 |
+
return internal, boundary
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def generate_procedural_grid(difficulty: str = "easy", seed: int = 42) -> Dict:
    """Generate a reproducible grid configuration for a given difficulty level.

    Easy:   5 buses, 20% renewables — simple balancing
    Medium: 10 buses, 50% renewables — congestion management
    Hard:   14 buses, 70% renewables — volatile supply, tight margins
    (Any unrecognized difficulty string falls through to "hard".)

    Guarantees: at least 30% of non-slack buses are loads, and at least 1 battery.
    Includes multi-agent zone assignments for POMDP mode.

    Reproducibility depends on the exact ORDER of rng calls below — do not
    reorder the sampling statements without accepting changed outputs for
    a given seed.
    """
    rng = np.random.default_rng(seed)

    if difficulty == "easy":
        n_buses = 5
        renewable_mix = 0.2
        max_steps = 50
        num_agents = 2  # Small grid: 2 agents
    elif difficulty == "medium":
        n_buses = 10
        renewable_mix = 0.5
        max_steps = 50
        num_agents = 3
    else:  # Hard
        n_buses = 14
        renewable_mix = 0.7
        max_steps = 50
        num_agents = 3

    # Watts-Strogatz small-world topology; seeded so the same (difficulty,
    # seed) pair always yields the same graph.
    G = nx.connected_watts_strogatz_graph(n_buses, k=4, p=0.3, seed=seed)

    # Generate bus types with guaranteed minimums
    n_non_slack = n_buses - 1
    min_loads = max(2, int(n_non_slack * 0.3))  # At least 30% loads
    min_batteries = 1
    # NOTE(review): assumes n_non_slack >= min_loads + min_batteries; holds
    # for all difficulties defined above (n_buses >= 5) — confirm before
    # adding smaller presets.

    types = ['slack']

    # Assign guaranteed loads first
    assigned = []
    for _ in range(min_loads):
        assigned.append('load')
    for _ in range(min_batteries):
        assigned.append('battery')

    # Fill remaining slots with renewable_mix probability
    remaining = n_non_slack - len(assigned)
    for _ in range(remaining):
        r = rng.random()
        if r < renewable_mix:
            # rng.choice returns np.str_; str() normalizes to a plain str
            assigned.append(str(rng.choice(['solar', 'wind'])))
        elif r < renewable_mix + 0.15:
            assigned.append('battery')
        else:
            assigned.append('load')

    # Shuffle to avoid spatial bias (loads always first)
    rng.shuffle(assigned)
    types.extend(assigned)

    # Estimate total load for slack bus sizing
    load_estimates = []
    buses = []
    lines = []

    for i, t in enumerate(types):
        # Only loads draw base power; everything else starts at 0 MW.
        base_p = float(rng.uniform(20, 50)) if t == 'load' else 0
        if t == 'load':
            load_estimates.append(base_p)

        # Set max_p based on bus type
        if t == 'battery':
            max_p = float(rng.uniform(30, 60))  # batteries can discharge
        elif t in ['solar', 'wind', 'generator']:
            max_p = float(rng.uniform(50, 100))
        elif t == 'slack':
            # Slack max_p sized to cover expected imbalance
            max_p = 0  # placeholder, set below
        else:
            max_p = 0

        buses.append({
            'id': i, 'type': t,
            'base_p': base_p,
            'max_p': max_p,
            'min_p': 0 if t in ['solar', 'wind', 'generator'] else -50,
            'capacity': 50 if t == 'battery' else 0,
            'init_soc': 25.0 if t == 'battery' else 0,  # batteries start half-charged
            'ramp_rate': 20.0 if t not in ['load', 'solar', 'wind'] else 0.0,
        })

    # Size slack bus to cover expected imbalance
    total_load_est = sum(load_estimates) if load_estimates else 100
    slack_max_p = max(100, total_load_est * 0.6)
    for b in buses:
        if b['type'] == 'slack':
            b['max_p'] = slack_max_p
            b['min_p'] = -slack_max_p

    # One line per graph edge; uniform susceptance, randomized capacity.
    for idx, (u, v) in enumerate(G.edges()):
        lines.append({
            'id': f"L_{idx}",
            'from': u, 'to': v,
            'susceptance': 50.0,
            'capacity': float(rng.uniform(80, 150))
        })

    # Multi-agent zone assignment
    zone_assignments = _partition_into_zones(G, num_agents)
    internal_lines, boundary_lines = _classify_lines(lines, zone_assignments)

    zone_names = _get_zone_names(num_agents)

    # Build per-zone bus lists
    zone_bus_ids = {a: [] for a in range(num_agents)}
    for bus_id, agent_id in zone_assignments.items():
        zone_bus_ids[agent_id].append(bus_id)

    return {
        "id": f"task_{difficulty}",
        "num_buses": n_buses,
        "buses": buses,
        "lines": lines,
        "max_steps": max_steps,
        "seed": seed,
        "difficulty": difficulty,
        # Multi-agent fields
        "num_agents": num_agents,
        "zone_assignments": zone_assignments,  # {bus_id: agent_id}
        "zone_names": zone_names,
        "zone_bus_ids": zone_bus_ids,  # {agent_id: [bus_ids]}
        "internal_lines": internal_lines,  # {agent_id: [line_ids]}
        "boundary_lines": boundary_lines,  # {agent_id: [line_ids]}
    }
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def generate_karnataka_task(seed: int = 808) -> Dict:
    """Build the fixed 15-bus Karnataka (KPTCL) grid task.

    A highly realistic topology based on the actual Karnataka KPTCL
    transmission map. Nodes carry real GPS coordinates (lat/lon) so the
    frontend can render them on a GIS map.

    Args:
        seed: Stored in the returned config for bookkeeping; the topology
            itself is fully deterministic and does not use randomness.

    Returns:
        Task config dict with buses, lines, and 4-agent zone metadata
        (same schema as the procedurally generated tasks).
    """
    nodes = [
        {"id": 0, "name": "Raichur_TPS", "type": "slack", "lat": 16.20, "lon": 77.36, "max_p": 200, "base_p": 0},
        {"id": 1, "name": "Kalaburagi", "type": "load", "lat": 17.33, "lon": 76.83, "max_p": 0, "base_p": 40},
        {"id": 2, "name": "Belagavi", "type": "load", "lat": 15.85, "lon": 74.50, "max_p": 0, "base_p": 35},
        {"id": 3, "name": "Hubballi", "type": "load", "lat": 15.36, "lon": 75.13, "max_p": 0, "base_p": 45},
        {"id": 4, "name": "Ballari_TPS", "type": "generator", "lat": 15.14, "lon": 76.92, "max_p": 150, "base_p": 0},
        {"id": 5, "name": "Chitradurga_Wind", "type": "wind", "lat": 14.23, "lon": 76.40, "max_p": 80, "base_p": 0},
        {"id": 6, "name": "Pavagada_Solar", "type": "solar", "lat": 14.10, "lon": 77.27, "max_p": 120, "base_p": 0},
        {"id": 7, "name": "Sharavathi_Hydro", "type": "generator", "lat": 14.18, "lon": 74.83, "max_p": 100, "base_p": 0},
        {"id": 8, "name": "Shivamogga", "type": "load", "lat": 13.93, "lon": 75.57, "max_p": 0, "base_p": 30},
        {"id": 9, "name": "Mangaluru", "type": "load", "lat": 12.87, "lon": 74.88, "max_p": 0, "base_p": 50},
        {"id": 10, "name": "Hassan_BESS", "type": "battery", "lat": 13.01, "lon": 76.10, "max_p": 50, "base_p": 0},
        {"id": 11, "name": "Mysuru", "type": "load", "lat": 12.30, "lon": 76.64, "max_p": 0, "base_p": 40},
        {"id": 12, "name": "Nelamangala", "type": "battery", "lat": 13.10, "lon": 77.39, "max_p": 50, "base_p": 0},
        {"id": 13, "name": "Bengaluru_City", "type": "load", "lat": 12.97, "lon": 77.59, "max_p": 0, "base_p": 120},
        {"id": 14, "name": "Kolar_Solar", "type": "solar", "lat": 13.13, "lon": 78.13, "max_p": 60, "base_p": 0},
    ]

    edges = [
        (0,1), (0,4), (4,5), (4,6), (5,3), (3,2), (3,7),
        (7,8), (8,9), (8,10), (9,10),  # (9,10) added: connects Mangaluru within zone 2
        (10,11), (10,12), (5,12),
        (6,12), (12,13), (13,14), (11,13)
    ]

    # Expand the compact node stubs into full bus records with
    # operational limits matching the procedural-task schema.
    buses = [
        {
            'id': n['id'], 'name': n['name'], 'type': n['type'],
            'lat': n['lat'], 'lon': n['lon'],
            'base_p': n['base_p'], 'max_p': n['max_p'],
            # Generation sources cannot absorb power; other bus types may
            # sink up to 50 MW (e.g. battery charging, controllable load).
            'min_p': 0 if n['type'] in ['solar', 'wind', 'generator'] else -50,
            'capacity': 100 if n['type'] == 'battery' else 0,
            'init_soc': 50.0 if n['type'] == 'battery' else 0,
            # Loads and renewables are not dispatchable, so no ramp limit.
            'ramp_rate': 40.0 if n['type'] not in ['load', 'solar', 'wind'] else 0.0,
        }
        for n in nodes
    ]

    # Line IDs encode their endpoints ("L_<from>_<to>") so downstream
    # consumers can recover the topology from IDs alone.
    lines = [
        {
            'id': f"L_{u}_{v}", 'from': u, 'to': v,
            'susceptance': 80.0, 'capacity': 150.0
        }
        for u, v in edges
    ]

    # Realistic agents based on regional discoms/SLDC zones
    zone_assignments = {
        0: 0, 1: 0, 4: 0,               # North Zone (Raichur/Bellary)
        2: 1, 3: 1, 5: 1, 7: 1, 8: 1,   # Hubli/Central Zone
        9: 2, 10: 2, 11: 2,             # Mysuru/Coast Zone
        6: 3, 12: 3, 13: 3, 14: 3       # Bengaluru Zone
    }

    internal_lines, boundary_lines = _classify_lines(lines, zone_assignments)

    # Invert the bus->agent map into per-agent bus lists.
    zone_bus_ids = {a: [] for a in range(4)}
    for b_id, a_id in zone_assignments.items():
        zone_bus_ids[a_id].append(b_id)

    return {
        "id": "task_karnataka",
        "num_buses": len(buses),
        "buses": buses,
        "lines": lines,
        "max_steps": 50,
        "seed": seed,
        "difficulty": "karnataka",
        "num_agents": 4,
        "zone_assignments": zone_assignments,
        "zone_names": ["Kalaburagi_Region", "Hubballi_Region", "Mysuru_Region", "Bengaluru_Region"],
        "zone_bus_ids": zone_bus_ids,
        "internal_lines": internal_lines,
        "boundary_lines": boundary_lines,
    }
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
def get_task(task_id: str) -> Dict:
    """Return a fresh, mutable deep copy of the task config for *task_id*.

    Raises:
        ValueError: if *task_id* is not a registered task.
    """
    generator = _TASK_GENERATORS.get(task_id)
    if generator is None:
        raise ValueError(
            f"Unknown task: {task_id}. "
            f"Available: {list(_TASK_GENERATORS.keys())}"
        )
    return copy.deepcopy(generator())
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
# Registry of task factories. Each entry rebuilds the config on every
# call, so get_task() can hand out independent copies with fixed seeds.
_TASK_GENERATORS = {
    "task_easy": lambda: generate_procedural_grid("easy", seed=101),
    "task_medium": lambda: generate_procedural_grid("medium", seed=102),
    "task_hard": lambda: generate_procedural_grid("hard", seed=103),
    "task_karnataka": lambda: generate_karnataka_task(),
}

# Deterministic tasks — same seed always produces the same grid
# NOTE: These are shared instances. Use get_task() for a mutable copy.
TASKS = {
    "task_easy": generate_procedural_grid("easy", seed=101),
    "task_medium": generate_procedural_grid("medium", seed=102),
    "task_hard": generate_procedural_grid("hard", seed=103),
    "task_karnataka": generate_karnataka_task()
}
|
src/visualization.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grid Visualization — Dashboard Generator
|
| 3 |
+
==========================================
|
| 4 |
+
Generates a base64-encoded PNG dashboard with two panels:
|
| 5 |
+
1. Grid topology with bus-type coloring and line-loading heat map
|
| 6 |
+
2. Frequency stability trace over time
|
| 7 |
+
|
| 8 |
+
Supports both GridObservation (single-agent) and ZoneObservation (multi-agent).
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import io
|
| 12 |
+
import base64
|
| 13 |
+
import logging
|
| 14 |
+
from typing import List, Optional, Sequence, Dict, Tuple
|
| 15 |
+
|
| 16 |
+
import matplotlib
|
| 17 |
+
matplotlib.use('Agg') # Non-interactive backend for server use
|
| 18 |
+
import matplotlib.pyplot as plt
|
| 19 |
+
from matplotlib.lines import Line2D
|
| 20 |
+
import networkx as nx
|
| 21 |
+
|
| 22 |
+
from .models import GridObservation
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _parse_line_endpoints(line_id: str) -> Optional[Tuple[int, int]]:
|
| 28 |
+
"""Parse line ID format 'L_<from>_<to>' into endpoint bus IDs.
|
| 29 |
+
|
| 30 |
+
Returns (from, to) on success, None on parse failure.
|
| 31 |
+
Requires exactly the format L_<int>_<int>.
|
| 32 |
+
"""
|
| 33 |
+
try:
|
| 34 |
+
parts = line_id.split('_')
|
| 35 |
+
if len(parts) == 3 and parts[0] == "L":
|
| 36 |
+
return int(parts[1]), int(parts[2])
|
| 37 |
+
except (ValueError, IndexError):
|
| 38 |
+
pass
|
| 39 |
+
return None
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def generate_dashboard(
    history: Sequence,
    current_obs,
    config: Optional[Dict] = None,
) -> str:
    """Generate a base64-encoded PNG dashboard image.

    Renders two side-by-side panels: (1) the grid topology with bus-type
    node colors and a line-loading heat map, (2) the grid frequency trace
    over the episode with the 49.5–50.5 Hz "normal" band shaded.

    Args:
        history: Sequence of observation objects for frequency trace.
            Each must expose ``timestep`` and ``grid_frequency``.
        current_obs: Current GridObservation or ZoneObservation for topology.
        config: Optional grid config dict. When provided, line endpoints
            are read from config (robust) instead of parsed from IDs.

    Returns:
        Base64-encoded PNG image string (without data URI prefix).
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    try:
        # Support both GridObservation and ZoneObservation:
        # single-agent obs have .buses/.lines; zone obs have
        # .local_buses plus separate internal/boundary line lists.
        buses = getattr(current_obs, "buses",
                        getattr(current_obs, "local_buses", []))
        lines = getattr(current_obs, "lines", None)
        if lines is None:
            internal = getattr(current_obs, "internal_lines", [])
            boundary = getattr(current_obs, "boundary_lines", [])
            lines = list(internal) + list(boundary)

        # Build line endpoint lookup from config if available
        line_endpoints: Dict[str, Tuple[int, int]] = {}
        if config:
            for l_cfg in config.get("lines", []):
                line_endpoints[l_cfg["id"]] = (l_cfg["from"], l_cfg["to"])

        # --- Plot 1: Grid Topology ---
        G = nx.Graph()

        # Node color by bus type (gray fallback for unknown types).
        color_map = {}
        for bus in buses:
            G.add_node(bus.id)
            if bus.type in ['generator', 'slack']:
                color_map[bus.id] = '#2ecc71'  # green
            elif bus.type == 'load':
                color_map[bus.id] = '#e74c3c'  # red
            elif bus.type == 'battery':
                color_map[bus.id] = '#3498db'  # blue
            else:
                color_map[bus.id] = '#f1c40f'  # yellow (renewables)

        # Build graph with line data as edge attributes
        for line in lines:
            # Get endpoints from config (preferred) or parse from ID;
            # lines whose endpoints cannot be determined are skipped.
            if line.id in line_endpoints:
                u, v = line_endpoints[line.id]
            else:
                parsed = _parse_line_endpoints(line.id)
                if parsed is None:
                    continue
                u, v = parsed

            G.add_edge(u, v, line_id=line.id, rho=line.rho,
                       connected=line.connected)

        # Build edge colors in G.edges() order (correct alignment).
        # rho thresholds: >0.9 overloaded (red), >0.7 stressed (orange),
        # otherwise healthy (green); disconnected lines are dashed gray.
        edge_colors = []
        edge_styles = []
        for u, v, data in G.edges(data=True):
            connected = data.get('connected', True)
            rho = abs(data.get('rho', 0.0))

            if not connected:
                edge_colors.append('lightgray')
                edge_styles.append('dashed')
            elif rho > 0.9:
                edge_colors.append('#e74c3c')  # red
                edge_styles.append('solid')
            elif rho > 0.7:
                edge_colors.append('#e67e22')  # orange
                edge_styles.append('solid')
            else:
                edge_colors.append('#2ecc71')  # green
                edge_styles.append('solid')

        node_colors = [color_map.get(n, 'gray') for n in G.nodes()]

        # Use config coordinates if available (stable layout). Only use
        # them when EVERY graph node has a coordinate, otherwise fall
        # back to a seeded spring layout so the picture is deterministic.
        pos = None
        if config:
            bus_coords = {}
            for b_cfg in config.get("buses", []):
                if "lon" in b_cfg and "lat" in b_cfg:
                    bus_coords[b_cfg["id"]] = (b_cfg["lon"], b_cfg["lat"])
            if len(bus_coords) == G.number_of_nodes():
                pos = bus_coords

        if pos is None and G.number_of_nodes() > 0:
            pos = nx.spring_layout(G, seed=42)

        if G.number_of_nodes() > 0 and pos:
            # Split edges by style; zips stay aligned because both lists
            # were built in the same G.edges() iteration order above.
            solid_edges = [
                (u, v) for (u, v, _), s in zip(G.edges(data=True), edge_styles)
                if s == 'solid'
            ]
            solid_colors = [
                c for c, s in zip(edge_colors, edge_styles) if s == 'solid'
            ]
            dashed_edges = [
                (u, v) for (u, v, _), s in zip(G.edges(data=True), edge_styles)
                if s == 'dashed'
            ]
            dashed_colors = [
                c for c, s in zip(edge_colors, edge_styles) if s == 'dashed'
            ]

            nx.draw_networkx_nodes(
                G, pos, ax=ax1, node_color=node_colors, node_size=300
            )
            nx.draw_networkx_labels(G, pos, ax=ax1, font_size=8)

            if solid_edges:
                nx.draw_networkx_edges(
                    G, pos, ax=ax1, edgelist=solid_edges,
                    edge_color=solid_colors, width=2, style='solid'
                )
            if dashed_edges:
                nx.draw_networkx_edges(
                    G, pos, ax=ax1, edgelist=dashed_edges,
                    edge_color=dashed_colors, width=1, style='dashed'
                )

            # Legend
            legend_elements = [
                Line2D([0], [0], marker='o', color='w',
                       markerfacecolor='#2ecc71', markersize=10,
                       label='Generator/Slack'),
                Line2D([0], [0], marker='o', color='w',
                       markerfacecolor='#e74c3c', markersize=10,
                       label='Load'),
                Line2D([0], [0], marker='o', color='w',
                       markerfacecolor='#3498db', markersize=10,
                       label='Battery'),
                Line2D([0], [0], marker='o', color='w',
                       markerfacecolor='#f1c40f', markersize=10,
                       label='Renewable'),
            ]
            ax1.legend(handles=legend_elements, loc='upper left', fontsize=7)
        else:
            ax1.text(0.5, 0.5, "No buses in observation",
                     ha='center', va='center', transform=ax1.transAxes)

        ax1.set_title("Grid Topology & Loading")

        # --- Plot 2: Frequency Trace ---
        if history:
            # Sort defensively by timestep in case history arrives unordered.
            history_sorted = sorted(history, key=lambda h: h.timestep)
            timesteps = [h.timestep for h in history_sorted]
            freqs = [h.grid_frequency for h in history_sorted]

            ax2.plot(timesteps, freqs, label='Frequency (Hz)',
                     color='#2980b9', linewidth=1.5)
            ax2.axhline(y=50.0, color='k', linestyle='--', linewidth=0.8)
            ax2.fill_between(timesteps, 49.5, 50.5,
                             color='green', alpha=0.1, label='Normal band')
            ax2.legend(fontsize=8)
        else:
            ax2.text(0.5, 0.5, "No frequency history",
                     ha='center', va='center', transform=ax2.transAxes)

        ax2.set_title("Frequency Stability")
        ax2.set_xlabel("Timestep")
        ax2.set_ylabel("Hz")
        ax2.set_ylim(48.5, 51.5)

        fig.tight_layout()

        buf = io.BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight')
        buf.seek(0)
        return base64.b64encode(buf.read()).decode('utf-8')

    finally:
        # Always release the figure — matplotlib keeps figures alive
        # globally, which would leak memory in a long-running server.
        plt.close(fig)
|
static/app.js
ADDED
|
@@ -0,0 +1,680 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// OpenGrid Control Room
|
| 2 |
+
const API = window.location.origin;
|
| 3 |
+
const AGENT_COLORS = ['#00bfff','#ff69b4','#ff6347','#32cd32','#9370db','#ffa500'];
|
| 4 |
+
const AGENT_NAMES = ['Bengaluru','Mysuru','Kalburagi','Hassan','Tumakuru','Bagalkot'];
|
| 5 |
+
|
| 6 |
+
// Real Karnataka state boundary path (source: @svg-maps/india)
|
| 7 |
+
const KARNATAKA_PATH = "m 124.338,505.46021 -0.617,-0.44733 0.776,-0.16422 -0.063,-0.8604 1.544,-0.77275 0.48,-0.70223 0.476,0.96821 0.881,0.0413 1.521,-0.74857 0.512,-1.53442 -0.938,-0.17228 0.62,-0.86141 0.404,0.86745 0.379,-0.0181 -0.412,-1.05888 1.641,-3.03861 -0.711,-0.35364 -0.968,0.47151 -0.458,-0.38889 1.391,-1.25837 1.141,0.50879 -0.068,-1.30269 0.567,-0.8997 -0.205,-0.93495 -1.688,-0.57629 -0.027,-0.50476 -1.422,-0.24583 -0.407,0.51987 0.312,-0.51181 -0.538,-0.73446 0.051,-1.1828 0.369,-0.24886 0.389,0.56622 0.156,-0.64581 -0.554,-0.135 -0.079,-1.12941 -0.891,-0.14911 0.075,-0.95309 -0.652,0.58133 -0.327,-0.41207 0.683,-0.18639 -0.196,-0.9007 0.79,0.92891 0.32,-1.12336 0.758,-0.0786 -0.063,0.39998 0.572,0.23676 0.284,-1.11026 1.444,-0.57126 0.104,-1.2241 0.432,0.74655 1.118,-0.14407 0.474,1.77622 1.304,-0.51987 0.135,-0.67805 0.996,0.0504 -0.625,-0.72439 0.746,-0.8191 0.043,-0.88055 3.282,-1.21706 1.441,0.0192 -0.248,-1.88302 1.091,-0.48057 0.066,-0.60249 -0.842,-0.44329 0.238,-0.33752 1.924,-0.0121 0.034,0.3486 1.225,-0.50375 1.062,1.64625 1.016,0 -0.135,0.69014 0.684,0.0373 1.401,-0.74252 0.119,-1.76514 1.19,0.0494 1.035,-0.52289 0.759,0.28311 0.772,-0.47957 0.515,0.92992 1.629,-0.45438 0.114,-0.9672 0.706,0.10276 0.024,0.73447 0.719,0.40703 0.619,-0.20251 -0.049,-1.65431 -0.596,-0.0151 0.725,-0.57931 0.002,-0.68712 -1.057,-1.6664 0.714,-0.83722 -0.047,-1.16568 -1.129,-0.91884 0.15,-0.85738 -0.592,-0.16422 -0.131,-0.72741 0.78,-0.19646 0.414,-1.88201 0.878,0.4302 0.285,0.99642 0.96,-0.20352 1.367,1.13646 0.469,-1.15761 0.779,0.81405 0.529,-0.69215 0.134,1.39841 0.785,0.64883 2.583,-0.66294 0.506,0.53196 0.889,-0.79693 0.877,0.55916 0.264,0.96015 -0.072,-1.13243 1.508,-0.56823 0.659,0.96922 1.418,-0.42618 0.181,0.86343 0.616,-0.0262 0.552,-1.2634 -0.964,-0.12593 0.234,-1.3037 -0.827,0.0463 -0.06,-0.80197 0.926,-0.54304 -0.661,-0.0191 0.474,-0.61155 -0.546,-0.44733 -0.175,-1.14955 1.758,-0.20553 0.273,-0.88459 1.268,-0.35766 -0.062,-1.16265 0.781,-0.0373 
0.001,-0.96115 1.038,-0.0242 -0.001,1.27348 0.863,-1.45483 1.02,1.77017 0.573,0.1743 0.159,-1.01455 0.617,-0.24079 -0.249,-0.98735 0.985,-0.11384 0.532,-0.86746 -1.061,-0.67301 0.067,-0.90271 1.3,0.65386 2.379,-1.03067 0.026,-2.60337 0.773,-0.14206 -0.16,-0.75159 0.445,-0.51584 -0.957,-0.41912 0.661,-1.51628 0.707,-0.0796 0.755,0.56923 0.186,-0.46546 0.52,0.69316 1.072,-0.008 -0.279,-0.93496 1.14,-0.47453 0.43,-1.41956 0.746,0.0645 -0.226,-0.76772 1.039,-1.27851 -0.101,-0.84126 1.616,-0.99742 0.517,0.51987 0.577,-0.38386 0.002,1.03772 0.845,0.269 -1.074,1.7198 1.624,0.0917 0.607,1.02866 0.938,-0.40804 -0.015,-0.62465 0.847,0.33953 0,0 1.11,0.2952 -0.81,1.70972 0.701,1.07298 0.059,1.15661 -1.148,1.00649 0.974,0.96115 1.129,0.37378 0.151,0.52592 -0.197,0.50576 -0.424,-0.25087 -0.15,1.209 -0.657,-0.11788 0.241,0.83219 -0.501,-0.0524 -0.482,1.20598 -0.497,-0.19243 -0.316,0.55916 -0.134,0.41509 1.287,0.40501 -0.083,0.37479 -2.338,1.22814 -0.218,2.41597 1.049,0.33349 0.243,0.55815 0.54,-0.71029 0.439,0.5229 0.867,-0.29319 0.04,0.66193 1.965,0.59442 -0.034,0.72036 -0.752,-0.18336 0.098,-0.48461 -0.258,0.59946 -0.617,0.134 -0.007,0.56521 -0.783,-0.21964 -0.013,0.54203 -1.307,0.0504 0.531,0.50879 -0.157,0.70222 -0.605,0.39595 -0.995,-0.35968 -0.368,1.80544 0.429,0.27202 -1.552,1.23318 0.386,0.24079 -0.812,1.03369 2.148,1.26239 0.77,2.12078 -0.963,2.15403 0.372,2.84517 -0.704,-0.0887 -0.296,1.50218 0.909,0.0564 -0.037,0.73648 -1.015,0.33852 0.343,0.5511 0.763,-0.2025 -0.109,1.3712 -1.522,0.41509 0.5,1.0357 -0.758,0.15516 -0.268,0.58132 -1.458,-0.16019 -0.097,0.3899 -1.189,0.12191 1.036,1.42158 1.22,0.50879 1.44,0.37176 1.732,-0.28613 2.033,0.83622 -0.027,1.10724 -0.53,-0.23676 -0.653,0.7657 -0.682,-0.11284 -0.286,0.39393 0.025,0.55614 0.46,0.0212 -0.568,1.41352 0.064,0.93395 0.476,0.26698 -0.391,1.59084 0.405,0.6186 -0.014,1.7742 0,0 -5.454,-0.80499 -2.208,0.37379 -1.622,0.9007 -0.915,1.47195 0.871,0.1884 -0.433,1.93137 1.711,1.64222 -0.184,0.7385 -0.728,-0.6045 
-1.092,0.41408 -0.056,2.91167 -1.145,-0.13803 -0.032,0.3355 1.193,1.21202 0.715,2.46333 1.007,-0.11788 0.12,0.81406 0.68,0.0907 0.34,0.5773 -0.906,0.82212 0.78,0.59543 -0.01,0.97425 -0.536,0.13601 0.459,0.48158 -1.574,2.61647 -0.792,-0.11788 0.123,-0.51583 -0.967,-0.008 -0.395,0.4171 -2.2,-0.39796 -1.67,-1.34803 -0.475,0.42113 0.216,1.18784 -0.435,1.01455 1.342,0.40904 0.765,-0.2821 -0.329,3.46982 -1.432,0.53599 0.371,0.96821 -0.793,2.87338 0.828,1.60897 0.583,0.10075 0.893,1.1828 2.16,-0.21258 -0.62,0.71835 -0.046,0.98633 -0.596,-0.30325 -0.627,0.50375 -0.084,0.94805 1.486,1.13344 -0.528,0.47251 0.271,0.69316 2.34,0.0121 0.538,0.65286 0.623,-0.0846 0.143,-1.60595 0.842,-1.08709 1.67,0.57729 1.03,-0.43423 -0.033,1.19086 1.667,0.1471 -0.081,0.84932 0.594,0.82615 0.668,-0.43524 -0.852,-1.8004 0.223,-0.64278 1.187,0.32743 0.259,0.81305 0.87,0.009 0.087,2.61143 -2.317,-0.3093 -0.272,0.77174 -0.606,0.11284 -0.067,0.61155 0.946,-0.48662 0.12,0.32643 -1.45,1.73995 1.197,0.3365 -0.162,0.82514 1.151,0.008 -0.48,0.70525 0.413,0.58032 -0.744,0.63372 -0.03,-0.38487 -0.881,0.0242 0.114,-1.85279 -0.843,-0.91682 -3.478,0.71734 -0.549,-1.09213 -2.039,-0.23978 -0.322,-0.96921 0.241,-1.61301 -1.637,-0.35766 -0.098,0.50577 -0.954,0.14911 -0.162,0.60449 0.907,0.0826 1.005,1.33896 -0.919,0.91884 2.193,2.09761 0.114,0.60853 -0.502,0.0876 -1.023,1.70468 0.506,1.54953 0.395,0.21158 0.313,-0.83522 0.706,0.70324 0.737,-0.47554 1.493,0.134 -0.091,-1.42461 -0.803,-0.6186 0.809,-0.16422 -0.137,-0.92488 0.441,-0.37983 0.037,1.24325 0.547,-0.46042 0.138,0.42617 0.467,-0.72339 0.348,1.19691 1.182,-0.38386 0.274,0.68006 0.826,-0.4302 1.362,0.2277 -0.332,0.77476 1.021,0.0474 -0.161,2.61646 0.695,-0.0846 0.092,-0.58435 0.522,0.45539 0.154,-1.25535 0.762,0.59141 0.828,-0.58536 0.537,0.3496 0.324,-0.16926 -0.55,-0.43624 0.809,-0.44028 0.442,-0.0363 -0.136,0.54505 0.666,-0.19746 0.276,0.6186 0.086,-1.24527 1.374,-0.48259 -0.051,-0.49669 1.082,-0.53297 -0.447,-1.03671 0.25,-0.56723 1.438,0.0796 
0.515,0.73345 1.148,-1.15156 0.243,1.23519 -0.745,0.2831 0.044,1.30169 0.444,-0.005 0.406,-0.89566 1.102,0.18941 0.07,-0.73145 1.516,0.98937 0.098,1.37926 -0.697,0.93798 0.512,0.50173 -0.084,0.55715 -0.865,-0.0816 -0.12,0.4574 0.469,0.68309 1.57,-0.28815 0.1,0.54506 0.7,0.15616 -0.224,1.28859 0.93,-0.66394 3.414,0.19545 -0.746,4.80576 0.884,0.48965 -0.636,0.26497 0.508,0.35262 0.695,-0.33146 0.241,0.44632 0.749,-0.20553 1.027,1.12638 0.729,-0.94402 0.457,0.80499 -0.184,1.24425 -0.581,0.36573 0.589,0.66091 -1.263,0.79391 0.402,0.47957 -0.545,0.33751 0.056,0.62163 -1.11,0.45639 0.133,1.46793 -0.738,-0.11486 0.275,1.46087 -1.203,0.11788 -0.689,-0.70726 -0.886,1.73994 -1.298,-0.005 -0.428,2.03715 0,0 -2.093,-0.37478 -1.548,-1.55457 -0.666,-0.0756 -0.281,1.08406 -0.42,-0.004 -0.75,-1.15459 -0.435,0.7657 -0.326,-0.17833 0.528,-0.73245 -0.35,-0.48057 -2.781,0.95812 0.306,1.00952 -1.425,2.63964 -0.578,-0.20956 -0.533,0.52994 -0.504,-0.54002 -1.339,0.35666 0.157,0.78484 -0.582,1.35407 0.177,1.09314 0.583,0.15515 -0.649,0.67402 1.043,-0.19042 -0.107,1.47699 -0.34,1.17273 -1.279,1.50318 -1.518,0.46849 -0.095,1.27851 5.457,0.74958 0.881,1.32285 -1.654,2.04924 -0.607,1.53744 -3.686,0.12292 -0.157,1.20799 -0.505,-0.269 0.073,1.05888 -0.775,1.89308 -1.251,-0.60147 -0.699,0.42415 -0.864,-0.84327 -0.902,-0.0877 -0.308,0.39997 -2.601,0.44129 0.136,1.076 -0.789,-0.26195 -0.316,-1.11429 -0.716,0.26598 0.195,-0.41106 -0.57,-0.40803 -0.663,0.0413 -0.276,0.76872 -1.254,-0.38788 -1.49,2.97816 0.469,0.90473 -0.285,0.58435 -0.435,-0.48562 -1.471,-0.28512 -3.897,0.009 -0.412,-1.29161 -0.758,-0.58939 -1.106,0.91783 -0.584,-0.0897 0,0 -0.566,-0.89365 0.471,-0.34255 -0.235,-0.77678 -1.521,0.48561 -1.318,-1.56061 -1.12,0.0746 -0.722,-1.36415 -1.59,0.27001 -0.003,-2.55803 -2.375,1.01354 -2.464,-0.37278 -1.096,-0.93294 -0.517,-1.86185 -1.73,0.19444 -0.323,-0.81909 -0.82,0.19545 -0.572,-1.09817 -1.219,-0.17933 -1.97,-2.90361 -1.331,-0.005 0.047,-1.72382 -1.168,-0.86343 0.021,-0.95712 1.17,-0.18034 
-0.168,-0.78686 -1.542,0.8725 -0.125,-0.73447 -1.125,-0.59946 -0.09,-0.98634 1.068,-0.45136 -1.071,-0.56823 -1.126,1.05183 -0.449,-1.34098 -0.885,0.17329 -0.339,-0.3496 0.161,-0.92388 -1.351,-0.46042 -0.063,0.67401 -0.739,0.13602 0.039,-1.17374 -0.891,0.11788 0.106,-0.29318 -0.574,-0.15012 0.499,-0.65689 -0.342,-0.6448 -2.621,0.77376 0,0 -0.965,-2.10365 -2.634,-10.6573 -0.512,-6.16488 -1.337,-5.02237 -0.768,-1.72786 -0.809,-0.39594 -0.627,-1.24728 -0.64,-3.47486 -0.611,-0.87048 -1.843,-6.03994 0.826,-0.61357 -0.599,-0.48662 -0.181,0.68611 -0.971,0.0302 -0.313,-1.75002 -0.524,-0.54808 0.32,-0.28814 -0.384,-1.61905 -0.669,-0.71633 -0.622,0.65084 -2.291,-1.75103 0.587,-0.13299 0.157,-0.89768 -0.396,-0.0121 -0.308,-1.05989 0,0 0.879,-0.538 0.754,0.24986 -0.068,-0.91279 0.831,0.64278 1.22,-0.98231 -0.176,-0.52289 0.851,-1.06593 -0.502,-1.21605 0.235,-1.02664 0.676,-0.8876 -0.318,-0.85033 -1.029,-0.74857 1.761,-0.76671 -0.278,-1.61602 -0.957,-0.46446 -0.003,-1.18481 -0.548,-1.00952 0.647,-0.7939 -0.69,-0.86645 0.298,-0.95108 -0.278,-0.97123 -0.496,-0.26799 -0.384,0.38083 -0.483,-0.69517 -0.35,0.62969 -0.986,0.0383 z";
|
| 8 |
+
|
| 9 |
+
let state = {
|
| 10 |
+
sessionId: null, task: 'task_karnataka', step: 0, done: false,
|
| 11 |
+
numAgents: 0, zoneInfo: {}, observations: {}, taskConfigs: {},
|
| 12 |
+
rewardHistory: [], freqHistory: [], perAgentRewards: {},
|
| 13 |
+
totalReward: 0, autoRunning: false, autoTimer: null,
|
| 14 |
+
safetyTotal: 0, lastOversight: null, mapScale: 1, alarms: []
|
| 15 |
+
};
|
| 16 |
+
|
| 17 |
+
// --- Init ---
// Wire up the task-selector buttons, fetch the task configs from the
// server, then start the first episode and hide the loading overlay.
document.addEventListener('DOMContentLoaded', () => {
    document.querySelectorAll('.task-btn').forEach(btn => {
        btn.addEventListener('click', () => {
            // Single-select behavior: clear 'active' everywhere, then mark this button.
            document.querySelectorAll('.task-btn').forEach(b => b.classList.remove('active'));
            btn.classList.add('active');
            state.task = btn.dataset.task;
        });
    });
    fetch(`${API}/tasks`).then(r=>r.json()).then(d=>{
        // Index task configs by id for later lookup.
        d.forEach(t => state.taskConfigs[t.id] = t);
        resetEpisode(); // reset only after configs are loaded
        setTimeout(() => document.getElementById('loading').classList.add('hidden'), 800);
    });
});
|
| 32 |
+
|
| 33 |
+
// --- API Calls ---
|
| 34 |
+
// Start a fresh multi-agent episode for the currently selected task.
// Clears all client-side episode state, then POSTs /reset_multi and
// stores the returned session id, zone metadata and initial observations.
async function resetEpisode() {
    stopAutoRun();
    state.step = 0; state.done = false; state.totalReward = 0;
    state.rewardHistory = []; state.freqHistory = []; state.safetyTotal = 0;
    state.alarms = [];
    // NOTE(review): mapFitted appears to be a global declared elsewhere in
    // this file; presumably forces the map to re-fit to the new topology.
    mapFitted = false;
    document.getElementById('alarmLog').innerHTML = '';
    document.getElementById('simStatus').textContent = 'RUNNING';
    try {
        const r = await fetch(`${API}/reset_multi?task_id=${state.task}`, {method:'POST'});
        const d = await r.json();
        state.sessionId = d.session_id;
        state.numAgents = d.num_agents;
        state.zoneInfo = d.zone_info;
        state.observations = d.observations;
        // One reward series per agent, keyed by agent index.
        state.perAgentRewards = {};
        for (let i = 0; i < d.num_agents; i++) state.perAgentRewards[i] = [];
        updateAll();
    } catch(e) { showAlert('critical', 'Reset failed: ' + e.message); }
}
|
| 54 |
+
|
| 55 |
+
// Advance the episode one step: build a heuristic action for every agent,
// POST the joint action to /step_multi, and fold the response (rewards,
// frequency, safety corrections, oversight report) into client state.
async function stepEpisode() {
    if (!state.sessionId || state.done) return;
    const actions = {};
    // The server keys agents by stringified index, so use String(i) throughout.
    for (let i = 0; i < state.numAgents; i++) {
        const obs = state.observations[String(i)];
        actions[String(i)] = generateHeuristicAction(i, obs);
    }
    try {
        const r = await fetch(`${API}/step_multi?session_id=${state.sessionId}`, {
            method: 'POST', headers: {'Content-Type':'application/json'},
            body: JSON.stringify({agent_actions: actions})
        });
        const d = await r.json();
        state.step++;
        state.observations = d.observations;
        state.totalReward += d.team_reward;
        state.rewardHistory.push(d.team_reward);
        state.lastOversight = d.oversight_report;
        state.done = d.done;
        const freq = getAvgFreq(d.observations);
        state.freqHistory.push(freq);
        // safety_reports is a string-keyed dict {"0": {...}, "1": {...}}, not an array
        Object.values(d.safety_reports || {}).forEach(sr => { if (sr.was_corrected) state.safetyTotal++; });
        for (const [aid, rew] of Object.entries(d.rewards)) {
            if (!state.perAgentRewards[aid]) state.perAgentRewards[aid] = [];
            state.perAgentRewards[aid].push(rew.value);
        }
        if (d.done) {
            // Episode ended: distinguish a blackout failure from normal completion.
            document.getElementById('simStatus').textContent = d.info.is_blackout ? 'BLACKOUT' : 'COMPLETE';
            stopAutoRun();
        }
        updateAll(d);
    } catch(e) { showAlert('critical', 'Step failed: ' + e.message); stopAutoRun(); }
}
|
| 89 |
+
|
| 90 |
+
/**
 * Fetch the episode grade from /grader and render it into #episodeScore,
 * color-coded green/amber/red at the 0.7 / 0.4 thresholds.
 * Failures show a warning alert rather than breaking the panel.
 */
async function getGrade() {
  if (!state.sessionId) return;
  try {
    const r = await fetch(`${API}/grader?session_id=${state.sessionId}`);
    // Guard non-2xx responses: d.score would be undefined and .toFixed would throw.
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const d = await r.json();
    document.getElementById('episodeScore').textContent = d.score.toFixed(4);
    document.getElementById('episodeScore').style.color =
      d.score > 0.7 ? 'var(--status-normal)' : d.score > 0.4 ? 'var(--status-warning)' : 'var(--status-critical)';
  } catch(e) { showAlert('warning', 'Grade failed: ' + e.message); }
}
|
| 100 |
+
|
| 101 |
+
// --- Heuristic Agent ---
|
| 102 |
+
// Proportional frequency controller used as the in-browser baseline agent:
// nudges every dispatchable unit (generator/battery) toward 50 Hz.
// Gain 8 MW/Hz, deltas clamped to ±15 MW, sub-0.5 MW nudges dropped as noise.
function generateHeuristicAction(agentId, obs) {
  if (!obs) return {bus_adjustments: [], topology_actions: []};
  const freqError = 50.0 - (obs.grid_frequency || 50);
  const adjustments = [];
  for (const bus of (obs.local_buses || [])) {
    // Exclude slack — physics solver overwrites its injection; adjusting it wastes the action
    if (bus.type !== 'battery' && bus.type !== 'generator') continue;
    const clamped = Math.min(15, Math.max(-15, freqError * 8));
    if (Math.abs(clamped) > 0.5) {
      adjustments.push({bus_id: bus.id, delta: Math.round(clamped * 10) / 10});
    }
  }
  return {bus_adjustments: adjustments, topology_actions: []};
}
|
| 118 |
+
|
| 119 |
+
// --- Auto Run ---
|
| 120 |
+
// Toggle the auto-run loop from the toolbar button.
function toggleAutoRun() {
  if (state.autoRunning) {
    stopAutoRun();
    return;
  }
  state.autoRunning = true;
  document.getElementById('btnAutoRun').classList.add('active');
  autoStep();
}
|
| 124 |
+
// Halt the auto-run loop: cancel any pending timer and clear the button highlight.
function stopAutoRun() {
  const pending = state.autoTimer;
  if (pending) clearTimeout(pending);
  state.autoRunning = false;
  document.getElementById('btnAutoRun').classList.remove('active');
}
|
| 129 |
+
// One tick of the auto-run loop: step the episode, then re-arm a 200 ms timer
// unless the run was cancelled or the episode finished while stepping.
async function autoStep() {
  if (!state.autoRunning || state.done) {
    stopAutoRun();
    return;
  }
  await stepEpisode();
  const keepGoing = state.autoRunning && !state.done;
  if (keepGoing) state.autoTimer = setTimeout(autoStep, 200);
}
|
| 134 |
+
|
| 135 |
+
// --- UI Updates ---
|
| 136 |
+
// Refresh every dashboard panel. stepData (the raw /step_multi response) is
// optional — panels that need per-step detail receive it, the rest read state.
function updateAll(stepData) {
  for (const refresh of [updateHeader, updateFrequency, updateSystemSummary, updateOversight]) {
    refresh();
  }
  updateAgentCards(stepData);
  updateLeaderboard();
  updateGridMap();
  updateCharts();
  updateAlarmLog(stepData);
}
|
| 147 |
+
|
| 148 |
+
// Mean grid frequency across agent observations (falls back to the global
// state.observations when no dict is passed); 50 Hz when there is no data.
function getAvgFreq(obs) {
  const source = obs || state.observations;
  const freqs = Object.values(source || {}).map(o => o.grid_frequency || 50);
  if (!freqs.length) return 50;
  return freqs.reduce((total, f) => total + f, 0) / freqs.length;
}
|
| 153 |
+
|
| 154 |
+
// Populate the top status bar: step counter, agent count, cumulative team
// reward, task name, color-coded average frequency, and blackout flag.
function updateHeader() {
  const byId = (id) => document.getElementById(id);
  const maxSteps = state.taskConfigs[state.task]?.max_steps || 50;
  byId('headerStep').textContent = `${state.step} / ${maxSteps}`;
  byId('headerAgents').textContent = `${state.numAgents} Active`;
  byId('headerReward').textContent = state.totalReward.toFixed(2);
  byId('headerEpisode').textContent = state.task.replace('task_','').toUpperCase();
  const freq = getAvgFreq();
  const freqEl = byId('headerFreq');
  freqEl.textContent = freq.toFixed(2) + ' Hz';
  freqEl.className = 'value ' + freqClass(freq);
  byId('totalSteps').textContent = state.step;
  const blackedOut = state.done && byId('simStatus').textContent === 'BLACKOUT';
  byId('blackoutStatus').textContent = blackedOut ? 'Yes' : 'No';
}
|
| 167 |
+
|
| 168 |
+
// Render the semicircular frequency gauge (#freqArc) as inline SVG plus the
// deviation readout and the grid-condition badge. The gauge spans 49–51 Hz,
// left-to-right, with the needle at the current fleet-average frequency.
function updateFrequency() {
  const freq = getAvgFreq();
  const cls = freqClass(freq);
  const colors = {normal:'#00e5a0',warning:'#ffd700',critical:'#ff3d3d'};
  const col = colors[cls];
  // Arc gauge — fixed 200x110 viewBox; arc center (100,100), radius 80.
  const container = document.getElementById('freqArc');
  const W=200, H=110, cx=100, cy=100, r=80;
  const minF=49, maxF=51;
  // Needle position as a 0..1 fraction of the 49–51 Hz span, clamped.
  const pct = Math.max(0,Math.min(1,(freq-minF)/(maxF-minF)));
  // Angles in standard math orientation: PI = left end of arc, 0 = right end.
  const startA=Math.PI, endA=0;
  const needleA = startA - pct*(startA-endA);
  let svg = `<svg viewBox="0 0 ${W} ${H}" xmlns="http://www.w3.org/2000/svg">`;
  // Background arc
  svg += `<path d="M${cx-r},${cy} A${r},${r} 0 0,1 ${cx+r},${cy}" fill="none" stroke="rgba(255,255,255,0.06)" stroke-width="10" stroke-linecap="round"/>`;
  // Colored segments: red/amber/green bands over the same 49–51 Hz scale.
  const segs = [{f:49,t:49.5,c:'#ff3d3d'},{f:49.5,t:49.85,c:'#ffd700'},{f:49.85,t:50.15,c:'#00e5a0'},{f:50.15,t:50.5,c:'#ffd700'},{f:50.5,t:51,c:'#ff3d3d'}];
  segs.forEach(s => {
    // Map band endpoints to arc angles (y is negated because SVG y grows downward).
    const a1=Math.PI-((s.f-minF)/(maxF-minF))*Math.PI;
    const a2=Math.PI-((s.t-minF)/(maxF-minF))*Math.PI;
    const x1=cx+r*Math.cos(a1),y1=cy-r*Math.sin(a1);
    const x2=cx+r*Math.cos(a2),y2=cy-r*Math.sin(a2);
    svg += `<path d="M${x1},${y1} A${r},${r} 0 0,0 ${x2},${y2}" fill="none" stroke="${s.c}" stroke-width="6" opacity="0.25" stroke-linecap="round"/>`;
  });
  // Needle — drawn slightly short of the rim (r-12) with a hub dot.
  const nx=cx+(r-12)*Math.cos(needleA), ny=cy-(r-12)*Math.sin(needleA);
  svg += `<line x1="${cx}" y1="${cy}" x2="${nx}" y2="${ny}" stroke="${col}" stroke-width="2.5" stroke-linecap="round"/>`;
  svg += `<circle cx="${cx}" cy="${cy}" r="4" fill="${col}"/>`;
  // Value text
  svg += `<text x="${cx}" y="${cy-20}" text-anchor="middle" fill="${col}" font-family="JetBrains Mono" font-size="28" font-weight="700" style="text-shadow:0 0 15px ${col}40">${freq.toFixed(2)}</text>`;
  svg += `<text x="${cx}" y="${cy-6}" text-anchor="middle" fill="#90a4ae" font-family="Inter" font-size="11">Hz</text>`;
  // Scale labels
  svg += `<text x="18" y="${cy+14}" fill="#546e7a" font-size="8" font-family="JetBrains Mono">49.0</text>`;
  svg += `<text x="${W-30}" y="${cy+14}" fill="#546e7a" font-size="8" font-family="JetBrains Mono">51.0</text>`;
  svg += `<text x="${cx}" y="12" text-anchor="middle" fill="#546e7a" font-size="8" font-family="JetBrains Mono">50.0</text>`;
  svg += '</svg>';
  container.innerHTML = svg;
  document.getElementById('freqDev').textContent = `Deviation: ${(freq-50).toFixed(3)} Hz | Nominal: 50.00 Hz`;
  // Grid condition badge — thresholds on |deviation|: 0.15 / 0.3 / 0.5 Hz.
  // NOTE: these are tighter than freqClass()'s 0.5 / 1.0 Hz bands; the badge
  // escalates earlier than the gauge color by design of these thresholds.
  const gc = document.getElementById('gridCondition');
  const dev = Math.abs(freq-50);
  if(dev<0.15){gc.textContent='NORMAL';gc.className='grid-condition normal';}
  else if(dev<0.3){gc.textContent='CONSERVATIVE OPS';gc.className='grid-condition conservative';}
  else if(dev<0.5){gc.textContent='CONSERVATION ALERT';gc.className='grid-condition alert';}
  else{gc.textContent='EMERGENCY';gc.className='grid-condition emergency';}
}
|
| 214 |
+
|
| 215 |
+
// Severity bucket for a frequency value: deviation from 50 Hz below 0.5 Hz is
// 'normal', below 1 Hz is 'warning', anything beyond that is 'critical'.
function freqClass(f) {
  const dev = Math.abs(f - 50);
  if (dev < 0.5) return 'normal';
  return dev < 1 ? 'warning' : 'critical';
}
|
| 216 |
+
|
| 217 |
+
// Aggregate zone totals into the system summary panel: generation, load,
// net balance, connected-line count, and overload count.
// NOTE(review): boundary lines appear in two agents' observations, so shared
// lines are counted once per zone here — same as the original tally; confirm intended.
function updateSystemSummary() {
  let gen = 0, load = 0, connected = 0, overloaded = 0, totalLines = 0;
  for (const obs of Object.values(state.observations)) {
    gen += obs.zone_gen_mw || 0;
    load += obs.zone_load_mw || 0;
    const zoneLines = (obs.internal_lines || []).concat(obs.boundary_lines || []);
    for (const line of zoneLines) {
      totalLines++;
      if (!line.connected) continue;
      connected++;
      if (line.rho > 1) overloaded++;
    }
  }
  document.getElementById('totalGen').textContent = gen.toFixed(1) + ' MW';
  document.getElementById('totalLoad').textContent = load.toFixed(1) + ' MW';
  document.getElementById('netBalance').textContent = (gen - load).toFixed(1) + ' MW';
  document.getElementById('linesConnected').textContent = `${connected} / ${totalLines}`;
  const olEl = document.getElementById('linesOverloaded');
  olEl.textContent = overloaded;
  olEl.style.color = overloaded > 0 ? 'var(--status-critical)' : 'var(--status-normal)';
}
|
| 235 |
+
|
| 236 |
+
// Render the latest oversight-agent report: coordination score (color-coded
// at 0.7 / 0.4), conflict and selfish-action counters, and the running
// safety-correction total. Silently skips when no report has arrived yet.
function updateOversight() {
  const report = state.lastOversight;
  if (!report) return;
  const score = report.coordination_score;
  const scoreEl = document.getElementById('coordScore');
  scoreEl.textContent = score.toFixed(2);
  scoreEl.style.color =
    score > 0.7 ? 'var(--status-normal)'
    : score > 0.4 ? 'var(--status-warning)'
    : 'var(--status-critical)';
  document.getElementById('conflicts').textContent = report.conflicting_actions_detected;
  document.getElementById('safetyCorrTotal').textContent = state.safetyTotal;
  document.getElementById('selfishActions').textContent = report.selfish_actions_detected;
}
|
| 246 |
+
|
| 247 |
+
// Scan the current state for alarm conditions (frequency deviation, line
// overload/congestion, safety-layer corrections, blackout) and prepend any new
// entries to the scrolling alarm log. Runs only when called with step data.
function updateAlarmLog(stepData) {
  if (!stepData) return;
  const logEl = document.getElementById('alarmLog');
  let newAlarms = [];
  // SCADA-style timestamp derived from the step counter, zero-padded.
  const timeStr = `T+${String(state.step).padStart(2,'0')}s`;

  // Check frequency — warn past 0.5 Hz deviation, critical past 1 Hz.
  const freq = getAvgFreq();
  if (Math.abs(freq - 50) > 0.5) {
    newAlarms.push({t: timeStr, msg: `FREQ DEVIATION: ${freq.toFixed(2)} Hz`, type: Math.abs(freq-50)>1?'crit'?'crit':'crit':'warn'});
  }

  // Check lines and safety
  // NOTE(review): boundary lines are visible to both neighboring agents, so a
  // shared overload can produce two identical entries per step — confirm intended.
  for (const [aid, obs] of Object.entries(state.observations)) {
    (obs.internal_lines||[]).concat(obs.boundary_lines||[]).forEach(l => {
      if (l.rho > 1.0) newAlarms.push({t: timeStr, msg: `OVERLOAD: Line ${l.id} at ${(l.rho*100).toFixed(0)}%`, type: 'crit'});
      else if (l.rho > 0.9) newAlarms.push({t: timeStr, msg: `CONGESTION: Line ${l.id} at ${(l.rho*100).toFixed(0)}%`, type: 'warn'});
    });
    const sr = stepData.safety_reports?.[aid];
    if (sr && sr.was_corrected) {
      newAlarms.push({t: timeStr, msg: `AGENT ${aid} SAFETY CORRECTED`, type: 'warn'});
    }
  }

  // Blackout is signalled via the status element set in stepEpisode().
  if (state.done && document.getElementById('simStatus').textContent==='BLACKOUT') {
    newAlarms.push({t: timeStr, msg: `SYSTEM COLLAPSE - BLACKOUT`, type: 'crit'});
  }

  if (newAlarms.length > 0) {
    // Newest first; cap the backlog so the DOM doesn't grow without bound.
    state.alarms = [...newAlarms, ...state.alarms].slice(0, 50); // Keep last 50
    logEl.innerHTML = state.alarms.map(a => `<div class="alarm-entry ${a.type}"><span class="alarm-time">[${a.t}]</span>${a.msg}</div>`).join('');
  }
}
|
| 280 |
+
|
| 281 |
+
// Rebuild one status card per agent: step and cumulative reward, zone
// load/generation, safety-layer verdict, and a per-agent reward sparkline.
// stepData is the raw /step_multi response; when absent (e.g. right after
// reset), reward/safety fields fall back to neutral defaults.
function updateAgentCards(stepData) {
  const container = document.getElementById('agentCards');
  container.innerHTML = '';
  for (let i = 0; i < state.numAgents; i++) {
    const obs = state.observations[String(i)];
    const zi = state.zoneInfo[String(i)] || {};
    const sr = stepData?.safety_reports?.[String(i)];
    const rew = stepData?.rewards?.[String(i)];
    const cumReward = (state.perAgentRewards[i]||[]).reduce((a,b)=>a+b,0);
    const wasCorrected = sr?.was_corrected || false;
    const cardClass = wasCorrected ? 'warning' : 'active';
    const html = `
      <div class="agent-card ${cardClass}">
        <div class="agent-header">
          <div class="agent-name">
            <span class="agent-dot" style="background:${AGENT_COLORS[i]}"></span>
            Agent ${i} - ${zi.zone_name||AGENT_NAMES[i]}
          </div>
          <span class="agent-status-badge ${wasCorrected?'corrected':'active'}">${wasCorrected?'Corrected':'Safe'}</span>
        </div>
        <div class="agent-metrics">
          <div class="agent-metric">
            <div class="label">Step Reward</div>
            <div class="value" style="color:${(rew?.value||0)>=0?'var(--status-normal)':'var(--status-critical)'}">${(rew?.value||0).toFixed(2)}</div>
          </div>
          <div class="agent-metric">
            <div class="label">Cumulative</div>
            <div class="value">${cumReward.toFixed(1)}</div>
          </div>
          <div class="agent-metric">
            <div class="label">Zone Load</div>
            <div class="value">${(obs?.zone_load_mw||0).toFixed(0)} MW</div>
          </div>
          <div class="agent-metric">
            <div class="label">Zone Gen</div>
            <div class="value">${(obs?.zone_gen_mw||0).toFixed(0)} MW</div>
          </div>
        </div>
        <div class="safety-shield ${wasCorrected?'corrected':'safe'}">
          ${wasCorrected?'⚠ Safety Corrected':'▣ Safety OK'}
          ${sr?.blocked_topology_actions ? ` | ${sr.blocked_topology_actions} blocked` : ''}
        </div>
        <div class="sparkline-container"><svg id="spark${i}"></svg></div>
      </div>`;
    container.innerHTML += html;
  }
  // Draw sparklines in a second pass: assigning innerHTML above re-parses the
  // container, which would wipe any SVG content drawn into an earlier card.
  for (let i = 0; i < state.numAgents; i++) {
    drawSparkline(`spark${i}`, state.perAgentRewards[i]||[], AGENT_COLORS[i]);
  }
}
|
| 332 |
+
|
| 333 |
+
// Rank agents by cumulative reward and render the mini leaderboard list,
// with #1/#2/#3 medals for the top three placements.
function updateLeaderboard() {
  const board = document.getElementById('leaderboard');
  const entries = [];
  for (let id = 0; id < state.numAgents; id++) {
    const total = (state.perAgentRewards[id] || []).reduce((sum, v) => sum + v, 0);
    const zone = state.zoneInfo[String(id)] || {};
    entries.push({id: id, name: zone.zone_name || AGENT_NAMES[id], score: total});
  }
  entries.sort((x, y) => y.score - x.score);
  const medals = ['#1', '#2', '#3'];
  board.innerHTML = entries.map((a, rank) => `
    <li>
      <span class="agent-label">
        <span class="agent-dot" style="background:${AGENT_COLORS[a.id]};width:6px;height:6px;border-radius:50%;display:inline-block;"></span>
        ${medals[rank] || ' '} ${a.name}
      </span>
      <span class="score" style="color:${AGENT_COLORS[a.id]}">${a.score.toFixed(1)}</span>
    </li>`).join('');
}
|
| 351 |
+
|
| 352 |
+
// --- Grid Map (Leaflet) ---
|
| 353 |
+
// Lazily-created Leaflet map instance; null until updateGridMap() first runs.
let leafletMap = null;
// Layer groups cleared and redrawn on every update: transmission-line
// polylines, bus-node markers, and zone score badges.
let mapLayers = { lines: null, nodes: null, badges: null };
// True once the viewport has been fitted to the bus extent; reset in
// resetEpisode() so a new task re-fits the map.
let mapFitted = false;
|
| 356 |
+
|
| 357 |
+
// Create the Leaflet map once (idempotent): dark CARTO basemap, Karnataka
// default view, and three layer groups that updateGridMap() redraws per step.
function initLeafletMap() {
  const container = document.getElementById('gridMap');
  if (leafletMap) return;  // already initialized — keep this idempotent

  // Karnataka bounds (south-west / north-east corners, [lat, lon]).
  const kaBounds = [[11.5, 73.5], [18.5, 79.0]];

  leafletMap = L.map(container, {
    center: [14.5, 76.5],
    zoom: 7,
    zoomControl: true,
    attributionControl: false,
    minZoom: 5,
    maxZoom: 15,
    // Canvas renderer: the many circleMarkers redraw faster than SVG nodes.
    preferCanvas: true,
  });

  // Dark tile layer for SCADA aesthetic
  L.tileLayer('https://{s}.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}{r}.png', {
    subdomains: 'abcd',
    maxZoom: 19,
  }).addTo(leafletMap);

  // Attribution (small, bottom-right) — added manually because the default
  // attribution control was disabled above.
  L.control.attribution({position: 'bottomright', prefix: false})
    .addAttribution('© <a href="https://carto.com/">CARTO</a>')
    .addTo(leafletMap);

  // Layer groups for easy clearing
  mapLayers.lines = L.layerGroup().addTo(leafletMap);
  mapLayers.nodes = L.layerGroup().addTo(leafletMap);
  mapLayers.badges = L.layerGroup().addTo(leafletMap);

  // Fix Leaflet size after container is fully rendered — Leaflet caches the
  // container size at construction, which can be 0 while layout settles.
  setTimeout(() => {
    leafletMap.invalidateSize();
    leafletMap.fitBounds(kaBounds, { padding: [20, 20] });
  }, 200);
}
|
| 396 |
+
|
| 397 |
+
// Redraw the entire grid map: transmission-line polylines (colored by loading),
// bus markers (typed icon colors, zone rings, labels), and zone score badges.
// Positions come from task-config GPS coordinates when present; otherwise buses
// are spread in circles around hard-coded Karnataka zone centers.
function updateGridMap() {
  if (!leafletMap) initLeafletMap();

  // Clear previous layers — the whole map is rebuilt from scratch each step.
  mapLayers.lines.clearLayers();
  mapLayers.nodes.clearLayers();
  mapLayers.badges.clearLayers();

  const typeIcons = {slack:'S',generator:'G',load:'L',battery:'B',solar:'PV',wind:'W'};
  const typeColors = {slack:'#00e5a0',generator:'#f5a623',load:'#e94560',battery:'#4a90d9',solar:'#ffeb3b',wind:'#64ffda'};

  // Collect buses — merge static config with runtime state. The task config
  // supplies the full bus list (incl. lat/lon); runtime observations supply
  // the live p_injection for buses each agent can see.
  let allBuses = [];
  const taskCfg = state.taskConfigs[state.task];
  const runtimeState = {};
  for (const obs of Object.values(state.observations)) {
    (obs.local_buses||[]).forEach(b => { runtimeState[b.id] = b; });
  }
  if (taskCfg && taskCfg.buses) {
    allBuses = taskCfg.buses.map(b => {
      const rt = runtimeState[b.id];
      return {...b, p_injection: rt ? rt.p_injection : (b.base_p || 0)};
    });
  } else {
    // No config loaded yet — fall back to whatever the observations expose.
    allBuses = Object.values(runtimeState);
  }

  const hasGPS = allBuses.some(b => b.lat !== undefined && b.lon !== undefined);

  // For non-GPS tasks, generate fake positions around Karnataka center
  const busPositions = {};
  const zones = [
    {id:0, lat:16.8, lon:76.8, color:AGENT_COLORS[0], label:'Kalaburagi'},
    {id:1, lat:15.2, lon:75.2, color:AGENT_COLORS[1], label:'Hubballi'},
    {id:2, lat:12.8, lon:75.5, color:AGENT_COLORS[2], label:'Mysuru'},
    {id:3, lat:13.2, lon:77.5, color:AGENT_COLORS[3], label:'Bengaluru'},
  ];

  allBuses.forEach((b, idx) => {
    const aid = findAgent(b.id);
    let lat, lon;
    if (hasGPS && b.lat !== undefined && b.lon !== undefined) {
      lat = b.lat;
      lon = b.lon;
    } else {
      // Fallback: spread around zone center — evenly spaced on a 0.3-degree
      // circle, ordered by the bus's index within its zone.
      const zd = zones[aid >= 0 && aid < zones.length ? aid : 0];
      const zBuses = allBuses.filter(bb => findAgent(bb.id) === aid);
      const zi = zBuses.indexOf(b);
      const a = (zi / Math.max(zBuses.length, 1)) * Math.PI * 2;
      lat = zd.lat + Math.cos(a) * 0.3;
      lon = zd.lon + Math.sin(a) * 0.3;
    }
    busPositions[b.id] = {lat, lon, bus: b, agent: aid};
  });

  // Draw transmission lines — dedupe by line id since boundary lines appear
  // in both neighboring agents' observations.
  const drawnLines = new Set();
  for (const obs of Object.values(state.observations)) {
    (obs.internal_lines||[]).concat(obs.boundary_lines||[]).forEach(l => {
      if (drawnLines.has(l.id)) return;
      drawnLines.add(l.id);
      // Endpoint bus ids are encoded in the line id as "L_<from>_<to>".
      const parts = l.id.replace('L_','').split('_');
      const fromId = parseInt(parts[0]);
      const toId = parseInt(parts[1]);
      const from = busPositions[fromId];
      const to = busPositions[toId];
      if (!from || !to) return;

      // Color by loading ratio rho: grey when open, red >100%, orange >80%.
      const lc = !l.connected ? '#4a5568' : l.rho > 1 ? '#ff1744' : l.rho > 0.8 ? '#ff9100' : '#e91e63';
      const w = !l.connected ? 1.5 : l.rho > 0.8 ? 5 : 3;

      const polyline = L.polyline(
        [[from.lat, from.lon], [to.lat, to.lon]],
        { color: lc, weight: w, dashArray: l.connected ? '10 5' : '4 4', opacity: 0.9 }
      );
      // Hover tooltip only for lines actually carrying flow.
      if (l.connected && Math.abs(l.flow) > 0.5) {
        polyline.bindTooltip(`${l.id}: ${l.flow.toFixed(0)} MW (${(l.rho*100).toFixed(0)}%)`, {
          permanent: false, className: 'leaflet-tooltip-dark'
        });
      }
      mapLayers.lines.addLayer(polyline);
    });
  }
  // Ensure lines are visible above tiles
  if (drawnLines.size > 0) {
    mapLayers.lines.eachLayer(l => { if (l.bringToFront) l.bringToFront(); });
  }

  // Draw bus markers — outer zone-colored ring + inner type-colored node,
  // plus a name label below and an MW injection label above.
  for (const [bid, pos] of Object.entries(busPositions)) {
    const b = pos.bus;
    const col = AGENT_COLORS[pos.agent] || '#4a5568';
    const fill = typeColors[b.type] || '#666';
    // Marker radius by role: slack biggest, loads smallest.
    const r = b.type === 'slack' ? 12 : b.type === 'load' ? 7 : 9;
    const inj = (b.p_injection !== undefined ? b.p_injection : 0);
    const busLabel = b.name || `${b.type} ${b.id}`;
    const icon = typeIcons[b.type] || '?';

    // Outer ring (zone color)
    const outerRing = L.circleMarker([pos.lat, pos.lon], {
      radius: r + 4, fillColor: 'transparent', fillOpacity: 0,
      color: col, weight: 1.5, opacity: 0.4
    });
    mapLayers.nodes.addLayer(outerRing);

    // Inner node
    const marker = L.circleMarker([pos.lat, pos.lon], {
      radius: r, fillColor: fill, fillOpacity: 0.9,
      color: col, weight: 1, opacity: 0.6
    });

    // Rich tooltip
    const tooltipHtml = `
      <div style="font-family:'JetBrains Mono',monospace;font-size:11px;min-width:120px;">
        <b style="color:${fill}">${icon}</b> <b>${busLabel}</b><br>
        <span style="color:#888">Type:</span> ${b.type}<br>
        <span style="color:#888">Injection:</span> <b>${inj.toFixed(1)} MW</b><br>
        <span style="color:#888">Zone:</span> ${state.zoneInfo[String(pos.agent)]?.zone_name || 'Agent ' + pos.agent}
      </div>`;
    marker.bindTooltip(tooltipHtml, { className: 'leaflet-tooltip-dark', direction: 'top', offset: [0, -r] });
    mapLayers.nodes.addLayer(marker);

    // Label under node (divIcon with negative anchor places it below the marker).
    const labelIcon = L.divIcon({
      className: 'bus-label-icon',
      html: `<span style="color:${fill};text-shadow:0 0 4px #000;font-size:9px;font-family:'JetBrains Mono',monospace;white-space:nowrap;">${busLabel}</span>`,
      iconSize: [80, 14],
      iconAnchor: [40, -r - 2],
    });
    L.marker([pos.lat, pos.lon], { icon: labelIcon, interactive: false }).addTo(mapLayers.nodes);

    // MW label above node
    const mwIcon = L.divIcon({
      className: 'bus-mw-icon',
      html: `<span style="color:#e0e0e0;text-shadow:0 0 4px #000;font-size:10px;font-weight:700;font-family:'JetBrains Mono',monospace;">${inj.toFixed(0)}</span>`,
      iconSize: [40, 14],
      iconAnchor: [20, r + 16],
    });
    L.marker([pos.lat, pos.lon], { icon: mwIcon, interactive: false }).addTo(mapLayers.nodes);
  }

  // Zone badge overlays — one score badge per active agent at the zone center.
  zones.slice(0, state.numAgents).forEach(z => {
    const zi = state.zoneInfo[String(z.id)] || {};
    const name = zi.zone_name || z.label || AGENT_NAMES[z.id];
    const cum = (state.perAgentRewards[z.id] || []).reduce((a, b) => a + b, 0);

    const badgeIcon = L.divIcon({
      className: 'zone-badge-leaflet',
      html: `<div style="background:rgba(10,14,26,0.85);border:1px solid ${z.color};border-radius:6px;padding:4px 10px;text-align:center;white-space:nowrap;">
        <div style="color:${z.color};font-size:11px;font-weight:700;font-family:'JetBrains Mono',monospace;">${name}</div>
        <div style="color:${z.color};font-size:10px;font-family:'JetBrains Mono',monospace;opacity:0.8">${cum.toFixed(1)} pts</div>
      </div>`,
      iconSize: [120, 36],
      iconAnchor: [60, 50],
    });
    L.marker([z.lat, z.lon], { icon: badgeIcon, interactive: false }).addTo(mapLayers.badges);
  });

  // Fit map to bus extent on first data load (GPS tasks only — synthetic
  // positions keep the default Karnataka view). mapFitted prevents refits
  // from fighting the user's own pan/zoom on later updates.
  if (!mapFitted && allBuses.length > 0) {
    const lats = allBuses.filter(b => b.lat).map(b => b.lat);
    const lons = allBuses.filter(b => b.lon).map(b => b.lon);
    if (lats.length > 0) {
      leafletMap.fitBounds([
        [Math.min(...lats) - 0.5, Math.min(...lons) - 0.5],
        [Math.max(...lats) + 0.5, Math.max(...lons) + 0.5]
      ]);
      mapFitted = true;
    }
  }
}
|
| 570 |
+
|
| 571 |
+
// Populate and position the DOM bus tooltip next to the cursor.
// Reads bus metadata from the hovered node's data-* attributes.
// NOTE(review): appears unused by the Leaflet map, which uses bindTooltip —
// presumably a leftover from an earlier SVG renderer; confirm before removal.
function showBusTooltip(e, node) {
  const tip = document.getElementById('busTooltip');
  const zone = state.zoneInfo[node.dataset.agent] || {};
  document.getElementById('ttTitle').textContent = `Bus ${node.dataset.bus} (${node.dataset.type})`;
  document.getElementById('ttType').textContent = node.dataset.type;
  document.getElementById('ttInj').textContent = node.dataset.inj + ' MW';
  document.getElementById('ttZone').textContent = zone.zone_name || 'Zone ' + node.dataset.agent;
  tip.style.left = (e.clientX + 12) + 'px';
  tip.style.top = (e.clientY - 20) + 'px';
  tip.classList.add('visible');
}
|
| 582 |
+
// Hide the bus hover tooltip.
function hideBusTooltip() {
  const tip = document.getElementById('busTooltip');
  tip.classList.remove('visible');
}
|
| 583 |
+
|
| 584 |
+
// Resolve which agent's zone owns a bus id; -1 when no zone lists it.
function findAgent(busId) {
  const owner = Object.entries(state.zoneInfo)
    .find(([, zone]) => (zone.bus_ids || []).includes(busId));
  return owner ? parseInt(owner[0]) : -1;
}
|
| 590 |
+
|
| 591 |
+
// --- Charts ---
|
| 592 |
+
// Draw a tiny polyline sparkline of the last 30 data points into the SVG
// element #id. The y-scale spans the full history's min/max (not just the
// visible window) so the shape stays stable as points scroll off.
function drawSparkline(id, data, color) {
  const svgEl = document.getElementById(id);
  if (!svgEl || !data.length) return;
  const w = svgEl.clientWidth || 120;
  const h = svgEl.clientHeight || 22;
  const lo = Math.min(...data);
  const span = (Math.max(...data) - lo) || 1;
  const recent = data.slice(-30);
  const denom = recent.length - 1 || 1;
  const pts = recent
    .map((v, i) => `${(i / denom) * w},${h - (((v - lo) / span) * h * 0.8 + h * 0.1)}`)
    .join(' ');
  svgEl.innerHTML = `<polyline points="${pts}" fill="none" stroke="${color}" stroke-width="1.5" opacity="0.8"/>`;
}
|
| 601 |
+
|
| 602 |
+
// Redraw the two history strip charts: team reward (auto-scaled y-axis) and
// average frequency (pinned to the 49–51 Hz band).
function updateCharts() {
  drawChart('rewardChart', state.rewardHistory, 'var(--chart-reward)', 'Reward');
  drawChart('freqChart', state.freqHistory, 'var(--chart-supply)', 'Hz', 49, 51);
}
|
| 608 |
+
|
| 609 |
+
// Render a line chart (gridlines, axis labels, filled area) as inline SVG
// into #containerId. fixedMin/fixedMax pin the y-axis when provided
// (used for the 49–51 Hz frequency band); otherwise it auto-scales to the
// data. `label` is accepted but not currently rendered.
function drawChart(containerId, data, color, label, fixedMin, fixedMax) {
  const el = document.getElementById(containerId);
  if (!el) return;
  const W = el.clientWidth||300, H = el.clientHeight||140;
  // Empty-history placeholder instead of a blank panel.
  if (!data.length) { el.innerHTML = `<svg viewBox="0 0 ${W} ${H}"><text x="${W/2}" y="${H/2}" text-anchor="middle" fill="var(--text-muted)" font-size="11">Waiting for data...</text></svg>`; return; }
  const pad = {t:10,r:10,b:20,l:40};
  const cw = W-pad.l-pad.r, ch = H-pad.t-pad.b;
  const min = fixedMin !== undefined ? fixedMin : Math.min(...data);
  const max = fixedMax !== undefined ? fixedMax : Math.max(...data);
  const range = max-min||1;  // avoid divide-by-zero on flat data
  // Data points mapped into the padded plot area; `||1` guards a 1-point series.
  const pts = data.map((v,i) => `${pad.l+(i/(data.length-1||1))*cw},${pad.t+ch-(((v-min)/range)*ch)}`).join(' ');
  let svg = `<svg viewBox="0 0 ${W} ${H}" xmlns="http://www.w3.org/2000/svg">`;
  // Grid lines — 5 horizontal rules with y-axis value labels.
  for(let i=0;i<=4;i++){const y=pad.t+ch*i/4;const v=(max-((max-min)*i/4)).toFixed(1);svg+=`<line x1="${pad.l}" y1="${y}" x2="${W-pad.r}" y2="${y}" stroke="rgba(255,255,255,0.05)"/><text x="${pad.l-4}" y="${y+3}" text-anchor="end" fill="var(--text-muted)" font-size="8" font-family="JetBrains Mono">${v}</text>`;}
  svg += `<polyline points="${pts}" fill="none" stroke="${color}" stroke-width="1.5"/>`;
  // Fill area — close the polyline down to the x-axis for a translucent fill.
  const firstX = pad.l, lastX = pad.l+(data.length-1)/(data.length-1||1)*cw;
  svg += `<polygon points="${pts} ${lastX},${pad.t+ch} ${firstX},${pad.t+ch}" fill="${color}" opacity="0.08"/>`;
  svg += '</svg>';
  el.innerHTML = svg;
  // Gen mix chart — piggybacks the donut refresh on the freq-chart redraw.
  // NOTE(review): works because updateCharts() always draws freqChart, but
  // this coupling is easy to break if the chart order changes.
  if (containerId === 'freqChart') updateGenMix();
}
|
| 632 |
+
|
| 633 |
+
// Render the generation-mix donut: positive bus injections summed by bus
// type, drawn as pie slices with a center total-MW readout.
function updateGenMix() {
  const el = document.getElementById('genMixChart');
  if (!el) return;
  const W = el.clientWidth||200, H = el.clientHeight||140;
  let types = {};
  // Only positive injections count as generation; loads (negative) are skipped.
  for (const obs of Object.values(state.observations)) {
    (obs.local_buses||[]).forEach(b => {
      if (b.p_injection > 0) types[b.type] = (types[b.type]||0) + b.p_injection;
    });
  }
  const total = Object.values(types).reduce((a,b)=>a+b,0) || 1;  // avoid /0 when idle
  const colors = {slack:'#00e5a0',generator:'#f5a623',solar:'#ffeb3b',wind:'#64ffda',battery:'#4a90d9'};
  let svg = `<svg viewBox="0 0 ${W} ${H}">`;
  const cx=W/2, cy=H/2-5, r=Math.min(W,H)*0.3;
  // Slices start at 12 o'clock and sweep clockwise.
  let startAngle = -Math.PI/2;
  for (const [type, val] of Object.entries(types)) {
    const pct = val/total;
    const endAngle = startAngle + pct * Math.PI*2;
    const x1=cx+r*Math.cos(startAngle), y1=cy+r*Math.sin(startAngle);
    const x2=cx+r*Math.cos(endAngle), y2=cy+r*Math.sin(endAngle);
    // large-arc flag is required for slices over half the circle.
    const large = pct > 0.5 ? 1 : 0;
    svg += `<path d="M${cx},${cy} L${x1},${y1} A${r},${r} 0 ${large},1 ${x2},${y2} Z" fill="${colors[type]||'#666'}" opacity="0.8"/>`;
    const mid = (startAngle+endAngle)/2;
    // Label slices over 8% only — smaller ones would overlap.
    if (pct > 0.08) {
      const lx=cx+(r+14)*Math.cos(mid), ly=cy+(r+14)*Math.sin(mid);
      svg += `<text x="${lx}" y="${ly}" text-anchor="middle" fill="var(--text-secondary)" font-size="8">${type} ${(pct*100).toFixed(0)}%</text>`;
    }
    startAngle = endAngle;
  }
  // Punch out the donut hole and print the MW total in the center.
  svg += `<circle cx="${cx}" cy="${cy}" r="${r*0.55}" fill="var(--bg-card)"/>`;
  svg += `<text x="${cx}" y="${cy-2}" text-anchor="middle" fill="var(--text-primary)" font-family="JetBrains Mono" font-size="14" font-weight="700">${total.toFixed(0)}</text>`;
  svg += `<text x="${cx}" y="${cy+10}" text-anchor="middle" fill="var(--text-muted)" font-size="8">MW</text>`;
  svg += '</svg>';
  el.innerHTML = svg;
}
|
| 668 |
+
|
| 669 |
+
// --- Alerts ---
|
| 670 |
+
// Flash the alert banner with the given severity class ('critical'/'warning')
// and message; auto-hides after 5 seconds.
function showAlert(type, msg) {
  const banner = document.getElementById('alertBanner');
  banner.className = `alert-banner ${type} visible`;
  document.getElementById('alertText').textContent = msg;
  setTimeout(() => banner.classList.remove('visible'), 5000);
}
|
| 676 |
+
// Hide the alert banner immediately (wired to the banner's Dismiss button).
function dismissAlert() {
  const banner = document.getElementById('alertBanner');
  banner.classList.remove('visible');
}
|
| 677 |
+
|
| 678 |
+
// --- Map Controls ---
|
| 679 |
+
// Scale the grid map view by `factor` (>1 zooms in, <1 zooms out) and redraw.
function zoomMap(factor) {
  state.mapScale = state.mapScale * factor;
  updateGridMap();
}
|
| 680 |
+
// Restore the grid map to its default 1:1 scale and redraw.
function resetMapView() {
  state.mapScale = 1;
  updateGridMap();
}
|
static/index.html
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<meta name="description" content="OpenGrid — Multi-Agent POMDP Power Grid Control Room with Safe RL">
|
| 7 |
+
<title>OpenGrid | Control Room</title>
|
| 8 |
+
<link rel="stylesheet" href="/static/style.css">
|
| 9 |
+
<link rel="stylesheet" href="https://unpkg.com/leaflet@1.9.4/dist/leaflet.css" />
|
| 10 |
+
<script src="https://unpkg.com/leaflet@1.9.4/dist/leaflet.js"></script>
|
| 11 |
+
<link rel="icon" href="/static/logo.png" type="image/png">
|
| 12 |
+
</head>
|
| 13 |
+
<body>
|
| 14 |
+
|
| 15 |
+
<!-- Loading Overlay -->
|
| 16 |
+
<div class="loading-overlay" id="loading">
|
| 17 |
+
<div class="loading-spinner"></div>
|
| 18 |
+
<div class="loading-text">OpenGrid — Initializing Control Room</div>
|
| 19 |
+
</div>
|
| 20 |
+
|
| 21 |
+
<!-- Alert Banner -->
|
| 22 |
+
<div class="alert-banner" id="alertBanner">
|
| 23 |
+
<span id="alertText"></span>
|
| 24 |
+
<button class="dismiss" onclick="dismissAlert()">Dismiss</button>
|
| 25 |
+
</div>
|
| 26 |
+
|
| 27 |
+
<!-- Main Layout -->
|
| 28 |
+
<div class="control-room">
|
| 29 |
+
|
| 30 |
+
<!-- ===== HEADER ===== -->
|
| 31 |
+
<header class="header">
|
| 32 |
+
<div class="header-brand">
|
| 33 |
+
<img src="/static/logo.png" alt="OpenGrid" class="logo-img" style="width:32px;height:32px;border-radius:6px;">
|
| 34 |
+
<div>
|
| 35 |
+
<h1>OpenGrid</h1>
|
| 36 |
+
<div class="sub">Multi-Agent Power Grid Control Room</div>
|
| 37 |
+
</div>
|
| 38 |
+
</div>
|
| 39 |
+
|
| 40 |
+
<div class="sim-badge">
|
| 41 |
+
<span class="dot"></span>
|
| 42 |
+
<span id="simStatus">READY</span>
|
| 43 |
+
</div>
|
| 44 |
+
|
| 45 |
+
<div class="header-stats">
|
| 46 |
+
<div class="header-stat">
|
| 47 |
+
<span class="label">Episode</span>
|
| 48 |
+
<span class="value normal" id="headerEpisode">--</span>
|
| 49 |
+
</div>
|
| 50 |
+
<div class="header-stat">
|
| 51 |
+
<span class="label">Step</span>
|
| 52 |
+
<span class="value" id="headerStep">0 / 50</span>
|
| 53 |
+
</div>
|
| 54 |
+
<div class="header-stat">
|
| 55 |
+
<span class="label">Frequency</span>
|
| 56 |
+
<span class="value normal" id="headerFreq">50.00 Hz</span>
|
| 57 |
+
</div>
|
| 58 |
+
<div class="header-stat">
|
| 59 |
+
<span class="label">Agents</span>
|
| 60 |
+
<span class="value normal" id="headerAgents">--</span>
|
| 61 |
+
</div>
|
| 62 |
+
<div class="header-stat">
|
| 63 |
+
<span class="label">Team Reward</span>
|
| 64 |
+
<span class="value" id="headerReward">0.00</span>
|
| 65 |
+
</div>
|
| 66 |
+
</div>
|
| 67 |
+
</header>
|
| 68 |
+
|
| 69 |
+
<!-- ===== LEFT PANEL ===== -->
|
| 70 |
+
<aside class="left-panel">
|
| 71 |
+
|
| 72 |
+
<!-- Frequency Display -->
|
| 73 |
+
<div class="card">
|
| 74 |
+
<div class="card-title">Grid Frequency</div>
|
| 75 |
+
<div class="freq-display">
|
| 76 |
+
<div class="freq-arc-container" id="freqArc"></div>
|
| 77 |
+
<div class="freq-deviation" id="freqDev">Deviation: 0.00 Hz | Nominal: 50.00 Hz</div>
|
| 78 |
+
<div class="grid-condition normal" id="gridCondition">NORMAL</div>
|
| 79 |
+
</div>
|
| 80 |
+
</div>
|
| 81 |
+
|
| 82 |
+
<!-- System Summary -->
|
| 83 |
+
<div class="card">
|
| 84 |
+
<div class="card-title">System Summary</div>
|
| 85 |
+
<div class="stat-row highlight">
|
| 86 |
+
<span class="label">Total Generation</span>
|
| 87 |
+
<span class="value" id="totalGen">-- MW</span>
|
| 88 |
+
</div>
|
| 89 |
+
<div class="stat-row">
|
| 90 |
+
<span class="label">Total Load</span>
|
| 91 |
+
<span class="value" id="totalLoad">-- MW</span>
|
| 92 |
+
</div>
|
| 93 |
+
<div class="stat-row">
|
| 94 |
+
<span class="label">Net Balance</span>
|
| 95 |
+
<span class="value" id="netBalance">-- MW</span>
|
| 96 |
+
</div>
|
| 97 |
+
<div class="stat-row">
|
| 98 |
+
<span class="label">Lines Connected</span>
|
| 99 |
+
<span class="value" id="linesConnected">--</span>
|
| 100 |
+
</div>
|
| 101 |
+
<div class="stat-row">
|
| 102 |
+
<span class="label">Lines Overloaded</span>
|
| 103 |
+
<span class="value" id="linesOverloaded" style="color: var(--status-normal);">0</span>
|
| 104 |
+
</div>
|
| 105 |
+
</div>
|
| 106 |
+
|
| 107 |
+
<!-- Coordination -->
|
| 108 |
+
<div class="card">
|
| 109 |
+
<div class="card-title">Oversight Agent</div>
|
| 110 |
+
<div class="coord-score">
|
| 111 |
+
<div class="big-value" id="coordScore" style="color: var(--status-normal);">1.00</div>
|
| 112 |
+
<div style="font-size:10px; color: var(--text-secondary); margin-top:4px;">Coordination Score</div>
|
| 113 |
+
</div>
|
| 114 |
+
<div class="stat-row">
|
| 115 |
+
<span class="label">Conflicts</span>
|
| 116 |
+
<span class="value" id="conflicts">0</span>
|
| 117 |
+
</div>
|
| 118 |
+
<div class="stat-row">
|
| 119 |
+
<span class="label">Safety Corrections</span>
|
| 120 |
+
<span class="value" id="safetyCorrTotal">0</span>
|
| 121 |
+
</div>
|
| 122 |
+
<div class="stat-row">
|
| 123 |
+
<span class="label">Selfish Actions</span>
|
| 124 |
+
<span class="value" id="selfishActions">0</span>
|
| 125 |
+
</div>
|
| 126 |
+
</div>
|
| 127 |
+
|
| 128 |
+
<!-- Exception Log -->
|
| 129 |
+
<div class="card" style="flex:1; display:flex; flex-direction:column; overflow:hidden;">
|
| 130 |
+
<div class="card-title" style="color: var(--status-warning);">Exception Log</div>
|
| 131 |
+
<div class="alarm-log" id="alarmLog">
|
| 132 |
+
<!-- Populated by JS -->
|
| 133 |
+
</div>
|
| 134 |
+
</div>
|
| 135 |
+
|
| 136 |
+
<!-- Task Selector -->
|
| 137 |
+
<div class="card" style="flex-shrink:0;">
|
| 138 |
+
<div class="card-title">Task & Controls</div>
|
| 139 |
+
<div class="task-selector" id="taskSelector">
|
| 140 |
+
<button class="task-btn" data-task="task_easy">Easy</button>
|
| 141 |
+
<button class="task-btn" data-task="task_medium">Medium</button>
|
| 142 |
+
<button class="task-btn" data-task="task_hard">Hard</button>
|
| 143 |
+
<button class="task-btn active" data-task="task_karnataka" style="color: #ffeb3b; border-color: rgba(255,235,59,0.3);">Karnataka</button>
|
| 144 |
+
</div>
|
| 145 |
+
<div class="controls-row" style="margin-top: var(--gap-sm);">
|
| 146 |
+
<button class="ctrl-btn active" id="btnReset" onclick="resetEpisode()">Reset</button>
|
| 147 |
+
<button class="ctrl-btn" id="btnStep" onclick="stepEpisode()">Step</button>
|
| 148 |
+
<button class="ctrl-btn" id="btnAutoRun" onclick="toggleAutoRun()">Auto</button>
|
| 149 |
+
</div>
|
| 150 |
+
</div>
|
| 151 |
+
|
| 152 |
+
</aside>
|
| 153 |
+
|
| 154 |
+
<!-- ===== CENTER PANEL (Grid Map) ===== -->
|
| 155 |
+
<main class="center-panel" id="centerPanel">
|
| 156 |
+
<div class="grid-map" id="gridMap"></div>
|
| 157 |
+
<div class="bus-tooltip" id="busTooltip">
|
| 158 |
+
<div class="tt-title" id="ttTitle">Bus 0</div>
|
| 159 |
+
<div class="tt-row"><span>Type</span><span class="tt-val" id="ttType">--</span></div>
|
| 160 |
+
<div class="tt-row"><span>Injection</span><span class="tt-val" id="ttInj">-- MW</span></div>
|
| 161 |
+
<div class="tt-row"><span>Zone</span><span class="tt-val" id="ttZone">--</span></div>
|
| 162 |
+
</div>
|
| 163 |
+
</main>
|
| 164 |
+
|
| 165 |
+
<!-- ===== RIGHT PANEL (Agent Monitor) ===== -->
|
| 166 |
+
<aside class="right-panel">
|
| 167 |
+
<div class="card">
|
| 168 |
+
<div class="card-title">Agent Leaderboard</div>
|
| 169 |
+
<ul class="leaderboard" id="leaderboard">
|
| 170 |
+
<!-- Populated by JS -->
|
| 171 |
+
</ul>
|
| 172 |
+
</div>
|
| 173 |
+
|
| 174 |
+
<div id="agentCards">
|
| 175 |
+
<!-- Populated by JS -->
|
| 176 |
+
</div>
|
| 177 |
+
</aside>
|
| 178 |
+
|
| 179 |
+
<!-- ===== BOTTOM PANEL ===== -->
|
| 180 |
+
<footer class="bottom-panel">
|
| 181 |
+
|
| 182 |
+
<!-- Reward History Chart -->
|
| 183 |
+
<div class="bottom-card">
|
| 184 |
+
<div class="card-title">Reward History</div>
|
| 185 |
+
<div class="chart-area" id="rewardChart"></div>
|
| 186 |
+
</div>
|
| 187 |
+
|
| 188 |
+
<!-- Frequency Trend -->
|
| 189 |
+
<div class="bottom-card">
|
| 190 |
+
<div class="card-title">Frequency Trend</div>
|
| 191 |
+
<div class="chart-area" id="freqChart"></div>
|
| 192 |
+
</div>
|
| 193 |
+
|
| 194 |
+
<!-- Generation Mix -->
|
| 195 |
+
<div class="bottom-card">
|
| 196 |
+
<div class="card-title">Generation Mix</div>
|
| 197 |
+
<div class="chart-area" id="genMixChart"></div>
|
| 198 |
+
</div>
|
| 199 |
+
|
| 200 |
+
<!-- Episode Score -->
|
| 201 |
+
<div class="bottom-card">
|
| 202 |
+
<div class="card-title">Episode Score</div>
|
| 203 |
+
<div class="coord-score" style="flex:1; display:flex; flex-direction:column; justify-content:center;">
|
| 204 |
+
<div class="big-value" id="episodeScore" style="color: var(--chart-reward); font-size: 36px;">--</div>
|
| 205 |
+
<div style="font-size:10px; color: var(--text-secondary); margin-top:4px;">Grader Score</div>
|
| 206 |
+
<div style="font-size:11px; margin-top:8px;">
|
| 207 |
+
<span style="color: var(--text-secondary);">Steps:</span>
|
| 208 |
+
<span id="totalSteps" style="font-family: 'JetBrains Mono'; font-weight:600;">0</span>
|
| 209 |
+
<span style="color: var(--text-secondary); margin-left:8px;">Blackout:</span>
|
| 210 |
+
<span id="blackoutStatus" style="font-family: 'JetBrains Mono'; font-weight:600; color: var(--status-normal);">No</span>
|
| 211 |
+
</div>
|
| 212 |
+
</div>
|
| 213 |
+
<div class="controls-row">
|
| 214 |
+
<button class="ctrl-btn" onclick="getGrade()">Grade</button>
|
| 215 |
+
<button class="ctrl-btn danger" onclick="resetEpisode()">New Episode</button>
|
| 216 |
+
</div>
|
| 217 |
+
</div>
|
| 218 |
+
|
| 219 |
+
</footer>
|
| 220 |
+
|
| 221 |
+
</div>
|
| 222 |
+
|
| 223 |
+
<script src="/static/app.js"></script>
|
| 224 |
+
</body>
|
| 225 |
+
</html>
|
static/karnataka.svg
ADDED
|
|
static/logo.png
ADDED
|
Git LFS Details
|
static/style.css
ADDED
|
@@ -0,0 +1,935 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* ============================================================================
|
| 2 |
+
OpenGrid KPTCL-SLDC Control Room — Design System
|
| 3 |
+
Inspired by ERCOT control room aesthetics, adapted for Karnataka grid
|
| 4 |
+
============================================================================ */
|
| 5 |
+
|
| 6 |
+
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;500;600;700&display=swap');
|
| 7 |
+
|
| 8 |
+
/* ---------- CSS Custom Properties ---------- */
|
| 9 |
+
:root {
|
| 10 |
+
/* Background layers */
|
| 11 |
+
--bg-primary: #0a0e1a;
|
| 12 |
+
--bg-secondary: #0f1628;
|
| 13 |
+
--bg-tertiary: #141d35;
|
| 14 |
+
--bg-glass: rgba(15, 22, 40, 0.85);
|
| 15 |
+
--bg-card: rgba(15, 22, 40, 0.7);
|
| 16 |
+
|
| 17 |
+
/* Operational states */
|
| 18 |
+
--status-normal: #00e5a0;
|
| 19 |
+
--status-warning: #ffd700;
|
| 20 |
+
--status-critical:#ff3d3d;
|
| 21 |
+
--status-offline: #4a5568;
|
| 22 |
+
--status-overload:#ff6b35;
|
| 23 |
+
|
| 24 |
+
/* Voltage colors */
|
| 25 |
+
--voltage-400kv: #e94560;
|
| 26 |
+
--voltage-220kv: #f5a623;
|
| 27 |
+
--voltage-110kv: #7ed321;
|
| 28 |
+
--voltage-66kv: #4a90d9;
|
| 29 |
+
|
| 30 |
+
/* Agent identity colors */
|
| 31 |
+
--agent-0: #00bfff;
|
| 32 |
+
--agent-1: #ff69b4;
|
| 33 |
+
--agent-2: #ff6347;
|
| 34 |
+
|
| 35 |
+
/* Text */
|
| 36 |
+
--text-primary: #e8eaf6;
|
| 37 |
+
--text-secondary: #90a4ae;
|
| 38 |
+
--text-accent: #00e5a0;
|
| 39 |
+
--text-danger: #ff5252;
|
| 40 |
+
--text-muted: #546e7a;
|
| 41 |
+
|
| 42 |
+
/* Chart */
|
| 43 |
+
--chart-demand: #00bfff;
|
| 44 |
+
--chart-supply: #00e5a0;
|
| 45 |
+
--chart-reward: #ffd700;
|
| 46 |
+
|
| 47 |
+
/* Spacing */
|
| 48 |
+
--gap-sm: 8px;
|
| 49 |
+
--gap-md: 12px;
|
| 50 |
+
--gap-lg: 16px;
|
| 51 |
+
--gap-xl: 20px;
|
| 52 |
+
|
| 53 |
+
/* Radius */
|
| 54 |
+
--radius-sm: 6px;
|
| 55 |
+
--radius-md: 10px;
|
| 56 |
+
--radius-lg: 14px;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
/* ---------- Reset & Base ---------- */
|
| 60 |
+
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
| 61 |
+
|
| 62 |
+
html, body {
|
| 63 |
+
height: 100%;
|
| 64 |
+
background: var(--bg-primary);
|
| 65 |
+
color: var(--text-primary);
|
| 66 |
+
font-family: 'Inter', 'Segoe UI', sans-serif;
|
| 67 |
+
font-size: 13px;
|
| 68 |
+
line-height: 1.5;
|
| 69 |
+
overflow: hidden;
|
| 70 |
+
-webkit-font-smoothing: antialiased;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
/* Subtle scanline overlay */
|
| 74 |
+
body::before {
|
| 75 |
+
content: '';
|
| 76 |
+
position: fixed;
|
| 77 |
+
top: 0; left: 0; right: 0; bottom: 0;
|
| 78 |
+
pointer-events: none;
|
| 79 |
+
z-index: 9999;
|
| 80 |
+
background: repeating-linear-gradient(
|
| 81 |
+
0deg,
|
| 82 |
+
transparent,
|
| 83 |
+
transparent 2px,
|
| 84 |
+
rgba(0,0,0,0.03) 2px,
|
| 85 |
+
rgba(0,0,0,0.03) 4px
|
| 86 |
+
);
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
/* ---------- Layout ---------- */
|
| 90 |
+
.control-room {
|
| 91 |
+
display: grid;
|
| 92 |
+
grid-template-rows: 52px 1fr 180px;
|
| 93 |
+
grid-template-columns: 260px 1fr 300px;
|
| 94 |
+
grid-template-areas:
|
| 95 |
+
"header header header"
|
| 96 |
+
"left center right"
|
| 97 |
+
"bottom bottom bottom";
|
| 98 |
+
height: 100vh;
|
| 99 |
+
gap: 1px;
|
| 100 |
+
background: rgba(255,255,255,0.04);
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
/* ---------- Header ---------- */
|
| 104 |
+
.header {
|
| 105 |
+
grid-area: header;
|
| 106 |
+
background: linear-gradient(90deg, #0a0e1a, #0f2040);
|
| 107 |
+
display: flex;
|
| 108 |
+
align-items: center;
|
| 109 |
+
padding: 0 var(--gap-lg);
|
| 110 |
+
gap: var(--gap-lg);
|
| 111 |
+
border-bottom: 1px solid rgba(0,229,160,0.15);
|
| 112 |
+
z-index: 10;
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
.header-brand {
|
| 116 |
+
display: flex;
|
| 117 |
+
align-items: center;
|
| 118 |
+
gap: var(--gap-sm);
|
| 119 |
+
flex-shrink: 0;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
.header-brand .logo {
|
| 123 |
+
width: 28px;
|
| 124 |
+
height: 28px;
|
| 125 |
+
background: linear-gradient(135deg, #00e5a0, #00bfff);
|
| 126 |
+
border-radius: 6px;
|
| 127 |
+
display: flex;
|
| 128 |
+
align-items: center;
|
| 129 |
+
justify-content: center;
|
| 130 |
+
font-weight: 700;
|
| 131 |
+
font-size: 14px;
|
| 132 |
+
color: #0a0e1a;
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
.header-brand h1 {
|
| 136 |
+
font-size: 14px;
|
| 137 |
+
font-weight: 600;
|
| 138 |
+
letter-spacing: 0.5px;
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
.header-brand .sub {
|
| 142 |
+
font-size: 10px;
|
| 143 |
+
color: var(--text-secondary);
|
| 144 |
+
letter-spacing: 1px;
|
| 145 |
+
text-transform: uppercase;
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
.header-stats {
|
| 149 |
+
display: flex;
|
| 150 |
+
gap: var(--gap-lg);
|
| 151 |
+
margin-left: auto;
|
| 152 |
+
align-items: center;
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
.header-stat {
|
| 156 |
+
display: flex;
|
| 157 |
+
flex-direction: column;
|
| 158 |
+
align-items: center;
|
| 159 |
+
padding: 4px 12px;
|
| 160 |
+
border-radius: var(--radius-sm);
|
| 161 |
+
background: rgba(255,255,255,0.04);
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
.header-stat .label {
|
| 165 |
+
font-size: 9px;
|
| 166 |
+
text-transform: uppercase;
|
| 167 |
+
letter-spacing: 1px;
|
| 168 |
+
color: var(--text-secondary);
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
.header-stat .value {
|
| 172 |
+
font-family: 'JetBrains Mono', monospace;
|
| 173 |
+
font-size: 14px;
|
| 174 |
+
font-weight: 600;
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
.header-stat .value.normal { color: var(--status-normal); }
|
| 178 |
+
.header-stat .value.warning { color: var(--status-warning); }
|
| 179 |
+
.header-stat .value.critical { color: var(--status-critical); }
|
| 180 |
+
|
| 181 |
+
.sim-badge {
|
| 182 |
+
display: flex;
|
| 183 |
+
align-items: center;
|
| 184 |
+
gap: 6px;
|
| 185 |
+
padding: 4px 10px;
|
| 186 |
+
border-radius: 20px;
|
| 187 |
+
background: rgba(0,229,160,0.1);
|
| 188 |
+
border: 1px solid rgba(0,229,160,0.25);
|
| 189 |
+
font-size: 10px;
|
| 190 |
+
font-weight: 600;
|
| 191 |
+
color: var(--status-normal);
|
| 192 |
+
text-transform: uppercase;
|
| 193 |
+
letter-spacing: 1px;
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
.sim-badge .dot {
|
| 197 |
+
width: 6px; height: 6px;
|
| 198 |
+
background: var(--status-normal);
|
| 199 |
+
border-radius: 50%;
|
| 200 |
+
animation: pulse-dot 2s infinite;
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
@keyframes pulse-dot {
|
| 204 |
+
0%, 100% { opacity: 1; box-shadow: 0 0 0 0 rgba(0,229,160,0.4); }
|
| 205 |
+
50% { opacity: 0.7; box-shadow: 0 0 0 4px rgba(0,229,160,0); }
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
/* ---------- Left Panel ---------- */
|
| 209 |
+
.left-panel {
|
| 210 |
+
grid-area: left;
|
| 211 |
+
background: var(--bg-secondary);
|
| 212 |
+
padding: var(--gap-md);
|
| 213 |
+
overflow-y: auto;
|
| 214 |
+
display: flex;
|
| 215 |
+
flex-direction: column;
|
| 216 |
+
gap: var(--gap-md);
|
| 217 |
+
border-right: 1px solid rgba(255,255,255,0.05);
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
/* ---------- Cards (shared) ---------- */
|
| 221 |
+
.card {
|
| 222 |
+
background: var(--bg-card);
|
| 223 |
+
border: 1px solid rgba(255,255,255,0.06);
|
| 224 |
+
border-radius: var(--radius-md);
|
| 225 |
+
padding: var(--gap-md);
|
| 226 |
+
backdrop-filter: blur(8px);
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
.card-title {
|
| 230 |
+
font-size: 10px;
|
| 231 |
+
font-weight: 600;
|
| 232 |
+
text-transform: uppercase;
|
| 233 |
+
letter-spacing: 1.5px;
|
| 234 |
+
color: var(--text-secondary);
|
| 235 |
+
margin-bottom: var(--gap-sm);
|
| 236 |
+
padding-bottom: 6px;
|
| 237 |
+
border-bottom: 1px solid rgba(255,255,255,0.06);
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
/* ---------- Alarm Log ---------- */
|
| 241 |
+
.alarm-log {
|
| 242 |
+
flex: 1;
|
| 243 |
+
max-height: 90px;
|
| 244 |
+
overflow-y: auto;
|
| 245 |
+
font-family: 'JetBrains Mono', monospace;
|
| 246 |
+
font-size: 10px;
|
| 247 |
+
line-height: 1.4;
|
| 248 |
+
display: flex;
|
| 249 |
+
flex-direction: column;
|
| 250 |
+
gap: 4px;
|
| 251 |
+
}
|
| 252 |
+
.alarm-entry {
|
| 253 |
+
padding: 4px 6px;
|
| 254 |
+
background: rgba(255,255,255,0.03);
|
| 255 |
+
border-left: 2px solid transparent;
|
| 256 |
+
border-radius: 2px;
|
| 257 |
+
}
|
| 258 |
+
.alarm-time { color: var(--text-muted); margin-right: 6px; }
|
| 259 |
+
.alarm-entry.warn { border-left-color: var(--status-warning); background: rgba(255,152,0,0.05); color: #ffb74d; }
|
| 260 |
+
.alarm-entry.crit { border-left-color: var(--status-critical); background: rgba(244,67,54,0.05); color: #ef5350; }
|
| 261 |
+
.alarm-entry.info { border-left-color: var(--status-normal); }
|
| 262 |
+
|
| 263 |
+
/* ---------- Frequency Display ---------- */
|
| 264 |
+
.freq-display {
|
| 265 |
+
text-align: center;
|
| 266 |
+
padding: var(--gap-md) var(--gap-sm);
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
.freq-arc-container {
|
| 270 |
+
position: relative;
|
| 271 |
+
width: 200px;
|
| 272 |
+
height: 110px;
|
| 273 |
+
margin: 0 auto;
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
.freq-arc-container svg { overflow: visible; }
|
| 277 |
+
|
| 278 |
+
.freq-value {
|
| 279 |
+
font-family: 'JetBrains Mono', monospace;
|
| 280 |
+
font-size: 32px;
|
| 281 |
+
font-weight: 700;
|
| 282 |
+
letter-spacing: -1px;
|
| 283 |
+
transition: color 0.3s;
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
.freq-value.normal { color: var(--status-normal); text-shadow: 0 0 20px rgba(0,229,160,0.3); }
|
| 287 |
+
.freq-value.warning { color: var(--status-warning); text-shadow: 0 0 20px rgba(255,215,0,0.3); }
|
| 288 |
+
.freq-value.critical { color: var(--status-critical); text-shadow: 0 0 20px rgba(255,61,61,0.3); animation: freq-blink 0.5s infinite; }
|
| 289 |
+
|
| 290 |
+
@keyframes freq-blink {
|
| 291 |
+
0%, 100% { opacity: 1; }
|
| 292 |
+
50% { opacity: 0.6; }
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
.freq-deviation {
|
| 296 |
+
margin-top: 4px;
|
| 297 |
+
font-family: 'JetBrains Mono', monospace;
|
| 298 |
+
font-size: 10px;
|
| 299 |
+
color: var(--text-secondary);
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
/* Grid condition badge */
|
| 303 |
+
.grid-condition {
|
| 304 |
+
display: flex;
|
| 305 |
+
align-items: center;
|
| 306 |
+
justify-content: center;
|
| 307 |
+
gap: 6px;
|
| 308 |
+
margin-top: var(--gap-sm);
|
| 309 |
+
padding: 5px 10px;
|
| 310 |
+
border-radius: 20px;
|
| 311 |
+
font-size: 10px;
|
| 312 |
+
font-weight: 600;
|
| 313 |
+
text-transform: uppercase;
|
| 314 |
+
letter-spacing: 0.8px;
|
| 315 |
+
}
|
| 316 |
+
.grid-condition.normal { background: rgba(0,229,160,0.1); color: var(--status-normal); border: 1px solid rgba(0,229,160,0.2); }
|
| 317 |
+
.grid-condition.conservative { background: rgba(255,215,0,0.08); color: var(--status-warning); border: 1px solid rgba(255,215,0,0.15); }
|
| 318 |
+
.grid-condition.alert { background: rgba(255,107,53,0.1); color: var(--status-overload); border: 1px solid rgba(255,107,53,0.2); }
|
| 319 |
+
.grid-condition.emergency { background: rgba(255,61,61,0.1); color: var(--status-critical); border: 1px solid rgba(255,61,61,0.2); animation: cond-pulse 1s infinite; }
|
| 320 |
+
|
| 321 |
+
@keyframes cond-pulse {
|
| 322 |
+
0%,100% { box-shadow: 0 0 0 0 rgba(255,61,61,0.2); }
|
| 323 |
+
50% { box-shadow: 0 0 0 4px rgba(255,61,61,0); }
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
/* ---------- System Summary ---------- */
|
| 327 |
+
.stat-row {
|
| 328 |
+
display: flex;
|
| 329 |
+
justify-content: space-between;
|
| 330 |
+
align-items: center;
|
| 331 |
+
padding: 4px 0;
|
| 332 |
+
font-size: 12px;
|
| 333 |
+
}
|
| 334 |
+
|
| 335 |
+
.stat-row .label { color: var(--text-secondary); }
|
| 336 |
+
.stat-row .value {
|
| 337 |
+
font-family: 'JetBrains Mono', monospace;
|
| 338 |
+
font-weight: 500;
|
| 339 |
+
}
|
| 340 |
+
|
| 341 |
+
.stat-row.highlight .value {
|
| 342 |
+
color: var(--status-normal);
|
| 343 |
+
font-weight: 600;
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
/* Progress bars */
|
| 347 |
+
.progress-bar {
|
| 348 |
+
height: 4px;
|
| 349 |
+
background: rgba(255,255,255,0.06);
|
| 350 |
+
border-radius: 2px;
|
| 351 |
+
overflow: hidden;
|
| 352 |
+
margin-top: 4px;
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
.progress-bar-fill {
|
| 356 |
+
height: 100%;
|
| 357 |
+
border-radius: 2px;
|
| 358 |
+
transition: width 0.5s;
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
/* ---------- Center Panel (Grid Map) ---------- */
|
| 362 |
+
.center-panel {
|
| 363 |
+
grid-area: center;
|
| 364 |
+
background: var(--bg-tertiary);
|
| 365 |
+
position: relative;
|
| 366 |
+
overflow: hidden;
|
| 367 |
+
}
|
| 368 |
+
|
| 369 |
+
.grid-map {
|
| 370 |
+
width: 100%;
|
| 371 |
+
height: 100%;
|
| 372 |
+
}
|
| 373 |
+
|
| 374 |
+
.grid-map svg {
|
| 375 |
+
width: 100%;
|
| 376 |
+
height: 100%;
|
| 377 |
+
}
|
| 378 |
+
|
| 379 |
+
/* SVG map styles */
|
| 380 |
+
.zone-polygon {
|
| 381 |
+
opacity: 0.06;
|
| 382 |
+
transition: opacity 0.4s;
|
| 383 |
+
cursor: pointer;
|
| 384 |
+
filter: blur(0.5px);
|
| 385 |
+
}
|
| 386 |
+
.zone-polygon:hover { opacity: 0.18; }
|
| 387 |
+
|
| 388 |
+
.substation-node { cursor: pointer; }
|
| 389 |
+
.substation-node:hover .node-outer { stroke-width: 2.5; filter: url(#glow); }
|
| 390 |
+
.substation-node:hover .node-label { opacity: 1; }
|
| 391 |
+
|
| 392 |
+
.node-label {
|
| 393 |
+
font-family: 'Inter', sans-serif;
|
| 394 |
+
font-size: 8px;
|
| 395 |
+
fill: var(--text-secondary);
|
| 396 |
+
text-anchor: middle;
|
| 397 |
+
pointer-events: none;
|
| 398 |
+
opacity: 0.7;
|
| 399 |
+
transition: opacity 0.2s;
|
| 400 |
+
}
|
| 401 |
+
|
| 402 |
+
.node-mw {
|
| 403 |
+
font-family: 'JetBrains Mono', monospace;
|
| 404 |
+
font-size: 9px;
|
| 405 |
+
fill: var(--text-primary);
|
| 406 |
+
text-anchor: middle;
|
| 407 |
+
pointer-events: none;
|
| 408 |
+
font-weight: 500;
|
| 409 |
+
}
|
| 410 |
+
|
| 411 |
+
.line-flow {
|
| 412 |
+
fill: none;
|
| 413 |
+
stroke-linecap: round;
|
| 414 |
+
}
|
| 415 |
+
|
| 416 |
+
/* Animated flow on lines: march the dash pattern along the stroke. */
@keyframes dash-flow {
    to { stroke-dashoffset: -24; }
}

.line-animated { animation: dash-flow 1.2s linear infinite; }

/* Flip direction for power flowing the opposite way. */
.line-animated.reverse { animation-direction: reverse; }
|
| 426 |
+
|
| 427 |
+
.flow-label {
|
| 428 |
+
font-family: 'JetBrains Mono', monospace;
|
| 429 |
+
font-size: 8px;
|
| 430 |
+
fill: rgba(232,234,246,0.6);
|
| 431 |
+
text-anchor: middle;
|
| 432 |
+
pointer-events: none;
|
| 433 |
+
}
|
| 434 |
+
|
| 435 |
+
.zone-badge { font-family: 'Inter', sans-serif; pointer-events: none; }
|
| 436 |
+
.zone-badge-bg {
|
| 437 |
+
rx: 8;
|
| 438 |
+
fill: rgba(10, 14, 26, 0.88);
|
| 439 |
+
stroke-width: 1;
|
| 440 |
+
backdrop-filter: blur(6px);
|
| 441 |
+
}
|
| 442 |
+
.zone-badge-name { font-size: 10px; font-weight: 600; text-anchor: middle; }
|
| 443 |
+
.zone-badge-status { font-size: 8px; text-anchor: middle; fill: var(--text-secondary); }
|
| 444 |
+
.zone-badge-reward { font-size: 9px; text-anchor: middle; font-weight: 600; font-family: 'JetBrains Mono', monospace; }
|
| 445 |
+
|
| 446 |
+
/* Bus tooltip */
|
| 447 |
+
.bus-tooltip {
|
| 448 |
+
position: absolute;
|
| 449 |
+
background: rgba(10, 14, 26, 0.95);
|
| 450 |
+
border: 1px solid rgba(0,229,160,0.2);
|
| 451 |
+
border-radius: var(--radius-sm);
|
| 452 |
+
padding: 8px 10px;
|
| 453 |
+
font-size: 11px;
|
| 454 |
+
pointer-events: none;
|
| 455 |
+
z-index: 20;
|
| 456 |
+
min-width: 140px;
|
| 457 |
+
backdrop-filter: blur(12px);
|
| 458 |
+
box-shadow: 0 4px 20px rgba(0,0,0,0.4);
|
| 459 |
+
display: none;
|
| 460 |
+
}
|
| 461 |
+
.bus-tooltip.visible { display: block; }
|
| 462 |
+
.bus-tooltip .tt-title {
|
| 463 |
+
font-weight: 600;
|
| 464 |
+
margin-bottom: 4px;
|
| 465 |
+
padding-bottom: 4px;
|
| 466 |
+
border-bottom: 1px solid rgba(255,255,255,0.08);
|
| 467 |
+
}
|
| 468 |
+
.bus-tooltip .tt-row {
|
| 469 |
+
display: flex;
|
| 470 |
+
justify-content: space-between;
|
| 471 |
+
padding: 1px 0;
|
| 472 |
+
}
|
| 473 |
+
.bus-tooltip .tt-row .tt-val {
|
| 474 |
+
font-family: 'JetBrains Mono', monospace;
|
| 475 |
+
font-weight: 500;
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
/* Map overlay controls */
|
| 479 |
+
.map-controls {
|
| 480 |
+
position: absolute;
|
| 481 |
+
top: var(--gap-md);
|
| 482 |
+
right: var(--gap-md);
|
| 483 |
+
display: flex;
|
| 484 |
+
flex-direction: column;
|
| 485 |
+
gap: 4px;
|
| 486 |
+
z-index: 5;
|
| 487 |
+
}
|
| 488 |
+
|
| 489 |
+
.map-btn {
|
| 490 |
+
width: 32px; height: 32px;
|
| 491 |
+
background: var(--bg-glass);
|
| 492 |
+
border: 1px solid rgba(255,255,255,0.1);
|
| 493 |
+
border-radius: var(--radius-sm);
|
| 494 |
+
color: var(--text-secondary);
|
| 495 |
+
font-size: 14px;
|
| 496 |
+
cursor: pointer;
|
| 497 |
+
display: flex;
|
| 498 |
+
align-items: center;
|
| 499 |
+
justify-content: center;
|
| 500 |
+
backdrop-filter: blur(8px);
|
| 501 |
+
transition: all 0.2s;
|
| 502 |
+
}
|
| 503 |
+
|
| 504 |
+
.map-btn:hover {
|
| 505 |
+
background: rgba(0,229,160,0.15);
|
| 506 |
+
color: var(--status-normal);
|
| 507 |
+
border-color: rgba(0,229,160,0.3);
|
| 508 |
+
}
|
| 509 |
+
|
| 510 |
+
/* ---------- Right Panel (Agent Monitor) ---------- */
|
| 511 |
+
.right-panel {
|
| 512 |
+
grid-area: right;
|
| 513 |
+
background: var(--bg-secondary);
|
| 514 |
+
padding: var(--gap-md);
|
| 515 |
+
overflow-y: auto;
|
| 516 |
+
display: flex;
|
| 517 |
+
flex-direction: column;
|
| 518 |
+
gap: var(--gap-md);
|
| 519 |
+
border-left: 1px solid rgba(255,255,255,0.05);
|
| 520 |
+
}
|
| 521 |
+
|
| 522 |
+
/* Agent cards */
|
| 523 |
+
.agent-card {
|
| 524 |
+
border-radius: var(--radius-md);
|
| 525 |
+
padding: var(--gap-md);
|
| 526 |
+
background: var(--bg-card);
|
| 527 |
+
border: 1px solid rgba(255,255,255,0.06);
|
| 528 |
+
backdrop-filter: blur(8px);
|
| 529 |
+
transition: border-color 0.3s, box-shadow 0.3s;
|
| 530 |
+
}
|
| 531 |
+
|
| 532 |
+
.agent-card.active {
|
| 533 |
+
border-color: rgba(0,229,160,0.2);
|
| 534 |
+
}
|
| 535 |
+
|
| 536 |
+
.agent-card.warning {
|
| 537 |
+
border-color: rgba(255,215,0,0.3);
|
| 538 |
+
box-shadow: 0 0 12px rgba(255,215,0,0.05);
|
| 539 |
+
}
|
| 540 |
+
|
| 541 |
+
.agent-card.critical {
|
| 542 |
+
border-color: rgba(255,61,61,0.3);
|
| 543 |
+
box-shadow: 0 0 12px rgba(255,61,61,0.08);
|
| 544 |
+
animation: card-pulse 1.5s infinite;
|
| 545 |
+
}
|
| 546 |
+
|
| 547 |
+
@keyframes card-pulse {
|
| 548 |
+
0%, 100% { box-shadow: 0 0 12px rgba(255,61,61,0.08); }
|
| 549 |
+
50% { box-shadow: 0 0 20px rgba(255,61,61,0.15); }
|
| 550 |
+
}
|
| 551 |
+
|
| 552 |
+
.agent-header {
|
| 553 |
+
display: flex;
|
| 554 |
+
justify-content: space-between;
|
| 555 |
+
align-items: center;
|
| 556 |
+
margin-bottom: var(--gap-sm);
|
| 557 |
+
}
|
| 558 |
+
|
| 559 |
+
.agent-name {
|
| 560 |
+
font-size: 12px;
|
| 561 |
+
font-weight: 600;
|
| 562 |
+
display: flex;
|
| 563 |
+
align-items: center;
|
| 564 |
+
gap: 6px;
|
| 565 |
+
}
|
| 566 |
+
|
| 567 |
+
.agent-dot {
|
| 568 |
+
width: 8px; height: 8px;
|
| 569 |
+
border-radius: 50%;
|
| 570 |
+
flex-shrink: 0;
|
| 571 |
+
}
|
| 572 |
+
|
| 573 |
+
.agent-status-badge {
|
| 574 |
+
font-size: 9px;
|
| 575 |
+
font-weight: 600;
|
| 576 |
+
padding: 2px 6px;
|
| 577 |
+
border-radius: 10px;
|
| 578 |
+
text-transform: uppercase;
|
| 579 |
+
letter-spacing: 0.5px;
|
| 580 |
+
}
|
| 581 |
+
|
| 582 |
+
.agent-status-badge.active {
|
| 583 |
+
background: rgba(0,229,160,0.15);
|
| 584 |
+
color: var(--status-normal);
|
| 585 |
+
}
|
| 586 |
+
|
| 587 |
+
.agent-status-badge.corrected {
|
| 588 |
+
background: rgba(255,215,0,0.15);
|
| 589 |
+
color: var(--status-warning);
|
| 590 |
+
}
|
| 591 |
+
|
| 592 |
+
.agent-metrics {
|
| 593 |
+
display: grid;
|
| 594 |
+
grid-template-columns: 1fr 1fr;
|
| 595 |
+
gap: 6px;
|
| 596 |
+
margin-top: var(--gap-sm);
|
| 597 |
+
}
|
| 598 |
+
|
| 599 |
+
.agent-metric {
|
| 600 |
+
padding: 6px 8px;
|
| 601 |
+
background: rgba(255,255,255,0.02);
|
| 602 |
+
border-radius: var(--radius-sm);
|
| 603 |
+
}
|
| 604 |
+
|
| 605 |
+
.agent-metric .label {
|
| 606 |
+
font-size: 9px;
|
| 607 |
+
text-transform: uppercase;
|
| 608 |
+
letter-spacing: 0.5px;
|
| 609 |
+
color: var(--text-muted);
|
| 610 |
+
}
|
| 611 |
+
|
| 612 |
+
.agent-metric .value {
|
| 613 |
+
font-family: 'JetBrains Mono', monospace;
|
| 614 |
+
font-size: 14px;
|
| 615 |
+
font-weight: 600;
|
| 616 |
+
margin-top: 2px;
|
| 617 |
+
}
|
| 618 |
+
|
| 619 |
+
/* Safety shield */
|
| 620 |
+
.safety-shield {
|
| 621 |
+
margin-top: var(--gap-sm);
|
| 622 |
+
padding: 6px 8px;
|
| 623 |
+
border-radius: var(--radius-sm);
|
| 624 |
+
display: flex;
|
| 625 |
+
align-items: center;
|
| 626 |
+
gap: 6px;
|
| 627 |
+
font-size: 10px;
|
| 628 |
+
font-weight: 600;
|
| 629 |
+
text-transform: uppercase;
|
| 630 |
+
letter-spacing: 0.5px;
|
| 631 |
+
}
|
| 632 |
+
|
| 633 |
+
.safety-shield.safe {
|
| 634 |
+
background: rgba(0,229,160,0.08);
|
| 635 |
+
border: 1px solid rgba(0,229,160,0.15);
|
| 636 |
+
color: var(--status-normal);
|
| 637 |
+
}
|
| 638 |
+
|
| 639 |
+
.safety-shield.corrected {
|
| 640 |
+
background: rgba(255,215,0,0.08);
|
| 641 |
+
border: 1px solid rgba(255,215,0,0.2);
|
| 642 |
+
color: var(--status-warning);
|
| 643 |
+
}
|
| 644 |
+
|
| 645 |
+
.safety-shield.violated {
|
| 646 |
+
background: rgba(255,61,61,0.08);
|
| 647 |
+
border: 1px solid rgba(255,61,61,0.2);
|
| 648 |
+
color: var(--status-critical);
|
| 649 |
+
}
|
| 650 |
+
|
| 651 |
+
/* Sparkline */
|
| 652 |
+
.sparkline-container {
|
| 653 |
+
margin-top: var(--gap-sm);
|
| 654 |
+
height: 30px;
|
| 655 |
+
background: rgba(255,255,255,0.02);
|
| 656 |
+
border-radius: var(--radius-sm);
|
| 657 |
+
padding: 4px;
|
| 658 |
+
}
|
| 659 |
+
|
| 660 |
+
.sparkline-container svg {
|
| 661 |
+
width: 100%;
|
| 662 |
+
height: 100%;
|
| 663 |
+
}
|
| 664 |
+
|
| 665 |
+
/* ---------- Bottom Panel ---------- */
|
| 666 |
+
.bottom-panel {
|
| 667 |
+
grid-area: bottom;
|
| 668 |
+
background: var(--bg-secondary);
|
| 669 |
+
display: grid;
|
| 670 |
+
grid-template-columns: 2fr 1fr 1fr 1fr;
|
| 671 |
+
gap: 1px;
|
| 672 |
+
border-top: 1px solid rgba(255,255,255,0.05);
|
| 673 |
+
}
|
| 674 |
+
|
| 675 |
+
.bottom-card {
|
| 676 |
+
background: var(--bg-card);
|
| 677 |
+
padding: var(--gap-md);
|
| 678 |
+
display: flex;
|
| 679 |
+
flex-direction: column;
|
| 680 |
+
}
|
| 681 |
+
|
| 682 |
+
.chart-area {
|
| 683 |
+
flex: 1;
|
| 684 |
+
position: relative;
|
| 685 |
+
min-height: 0;
|
| 686 |
+
}
|
| 687 |
+
|
| 688 |
+
.chart-area canvas, .chart-area svg {
|
| 689 |
+
width: 100%;
|
| 690 |
+
height: 100%;
|
| 691 |
+
}
|
| 692 |
+
|
| 693 |
+
/* Reward chart */
|
| 694 |
+
.reward-history {
|
| 695 |
+
flex: 1;
|
| 696 |
+
}
|
| 697 |
+
|
| 698 |
+
/* Controls */
|
| 699 |
+
.controls-row {
|
| 700 |
+
display: flex;
|
| 701 |
+
gap: var(--gap-sm);
|
| 702 |
+
margin-top: var(--gap-sm);
|
| 703 |
+
}
|
| 704 |
+
|
| 705 |
+
.ctrl-btn {
|
| 706 |
+
flex: 1;
|
| 707 |
+
padding: 6px 10px;
|
| 708 |
+
background: rgba(255,255,255,0.04);
|
| 709 |
+
border: 1px solid rgba(255,255,255,0.1);
|
| 710 |
+
border-radius: var(--radius-sm);
|
| 711 |
+
color: var(--text-primary);
|
| 712 |
+
font-family: 'Inter', sans-serif;
|
| 713 |
+
font-size: 11px;
|
| 714 |
+
font-weight: 500;
|
| 715 |
+
cursor: pointer;
|
| 716 |
+
transition: all 0.2s;
|
| 717 |
+
text-align: center;
|
| 718 |
+
}
|
| 719 |
+
|
| 720 |
+
.ctrl-btn:hover {
|
| 721 |
+
background: rgba(0,229,160,0.1);
|
| 722 |
+
border-color: rgba(0,229,160,0.3);
|
| 723 |
+
}
|
| 724 |
+
|
| 725 |
+
.ctrl-btn.active {
|
| 726 |
+
background: rgba(0,229,160,0.15);
|
| 727 |
+
border-color: var(--status-normal);
|
| 728 |
+
color: var(--status-normal);
|
| 729 |
+
}
|
| 730 |
+
|
| 731 |
+
.ctrl-btn.danger {
|
| 732 |
+
border-color: rgba(255,61,61,0.3);
|
| 733 |
+
}
|
| 734 |
+
|
| 735 |
+
.ctrl-btn.danger:hover {
|
| 736 |
+
background: rgba(255,61,61,0.1);
|
| 737 |
+
border-color: rgba(255,61,61,0.5);
|
| 738 |
+
color: var(--status-critical);
|
| 739 |
+
}
|
| 740 |
+
|
| 741 |
+
/* Task selector */
|
| 742 |
+
.task-selector {
|
| 743 |
+
display: flex;
|
| 744 |
+
gap: 4px;
|
| 745 |
+
}
|
| 746 |
+
|
| 747 |
+
.task-btn {
|
| 748 |
+
flex: 1;
|
| 749 |
+
padding: 4px 8px;
|
| 750 |
+
background: rgba(255,255,255,0.03);
|
| 751 |
+
border: 1px solid rgba(255,255,255,0.08);
|
| 752 |
+
border-radius: var(--radius-sm);
|
| 753 |
+
color: var(--text-secondary);
|
| 754 |
+
font-size: 10px;
|
| 755 |
+
font-weight: 500;
|
| 756 |
+
cursor: pointer;
|
| 757 |
+
transition: all 0.2s;
|
| 758 |
+
text-transform: uppercase;
|
| 759 |
+
letter-spacing: 0.5px;
|
| 760 |
+
}
|
| 761 |
+
|
| 762 |
+
.task-btn:hover { border-color: rgba(0,229,160,0.3); color: var(--text-primary); }
|
| 763 |
+
.task-btn.active { background: rgba(0,229,160,0.1); border-color: var(--status-normal); color: var(--status-normal); }
|
| 764 |
+
|
| 765 |
+
/* Leaderboard */
|
| 766 |
+
.leaderboard {
|
| 767 |
+
list-style: none;
|
| 768 |
+
}
|
| 769 |
+
|
| 770 |
+
.leaderboard li {
|
| 771 |
+
display: flex;
|
| 772 |
+
justify-content: space-between;
|
| 773 |
+
align-items: center;
|
| 774 |
+
padding: 5px 0;
|
| 775 |
+
font-size: 11px;
|
| 776 |
+
border-bottom: 1px solid rgba(255,255,255,0.03);
|
| 777 |
+
}
|
| 778 |
+
|
| 779 |
+
.leaderboard li:last-child { border-bottom: none; }
|
| 780 |
+
|
| 781 |
+
.leaderboard .agent-label {
|
| 782 |
+
display: flex;
|
| 783 |
+
align-items: center;
|
| 784 |
+
gap: 6px;
|
| 785 |
+
}
|
| 786 |
+
|
| 787 |
+
.leaderboard .score {
|
| 788 |
+
font-family: 'JetBrains Mono', monospace;
|
| 789 |
+
font-weight: 600;
|
| 790 |
+
font-size: 12px;
|
| 791 |
+
}
|
| 792 |
+
|
| 793 |
+
/* Coordination score */
|
| 794 |
+
.coord-score {
|
| 795 |
+
text-align: center;
|
| 796 |
+
padding: var(--gap-sm);
|
| 797 |
+
}
|
| 798 |
+
|
| 799 |
+
.coord-score .big-value {
|
| 800 |
+
font-family: 'JetBrains Mono', monospace;
|
| 801 |
+
font-size: 28px;
|
| 802 |
+
font-weight: 700;
|
| 803 |
+
}
|
| 804 |
+
|
| 805 |
+
/* Alert banner */
|
| 806 |
+
.alert-banner {
|
| 807 |
+
position: fixed;
|
| 808 |
+
top: 52px;
|
| 809 |
+
left: 0; right: 0;
|
| 810 |
+
z-index: 100;
|
| 811 |
+
padding: 8px var(--gap-lg);
|
| 812 |
+
display: flex;
|
| 813 |
+
align-items: center;
|
| 814 |
+
gap: var(--gap-sm);
|
| 815 |
+
font-size: 12px;
|
| 816 |
+
font-weight: 500;
|
| 817 |
+
transform: translateY(-100%);
|
| 818 |
+
transition: transform 0.3s;
|
| 819 |
+
}
|
| 820 |
+
|
| 821 |
+
.alert-banner.visible { transform: translateY(0); }
|
| 822 |
+
|
| 823 |
+
.alert-banner.critical {
|
| 824 |
+
background: rgba(255,61,61,0.15);
|
| 825 |
+
border-bottom: 1px solid rgba(255,61,61,0.3);
|
| 826 |
+
color: var(--status-critical);
|
| 827 |
+
}
|
| 828 |
+
|
| 829 |
+
.alert-banner.warning {
|
| 830 |
+
background: rgba(255,215,0,0.1);
|
| 831 |
+
border-bottom: 1px solid rgba(255,215,0,0.2);
|
| 832 |
+
color: var(--status-warning);
|
| 833 |
+
}
|
| 834 |
+
|
| 835 |
+
.alert-banner .dismiss {
|
| 836 |
+
margin-left: auto;
|
| 837 |
+
background: none;
|
| 838 |
+
border: 1px solid currentColor;
|
| 839 |
+
border-radius: var(--radius-sm);
|
| 840 |
+
color: inherit;
|
| 841 |
+
padding: 2px 8px;
|
| 842 |
+
font-size: 10px;
|
| 843 |
+
cursor: pointer;
|
| 844 |
+
opacity: 0.7;
|
| 845 |
+
}
|
| 846 |
+
|
| 847 |
+
.alert-banner .dismiss:hover { opacity: 1; }
|
| 848 |
+
|
| 849 |
+
/* Scrollbar (WebKit): thin, translucent thumb that brightens on hover. */
::-webkit-scrollbar { width: 4px; }
::-webkit-scrollbar-track { background: transparent; }
::-webkit-scrollbar-thumb {
    background: rgba(255,255,255,0.1);
    border-radius: 2px;
}
::-webkit-scrollbar-thumb:hover { background: rgba(255,255,255,0.2); }
|
| 854 |
+
|
| 855 |
+
/* Loading state: full-screen overlay shown until the app is ready. */
.loading-overlay {
    position: fixed;
    top: 0; right: 0; bottom: 0; left: 0;
    z-index: 1000;
    display: flex;
    flex-direction: column;
    align-items: center;
    justify-content: center;
    background: var(--bg-primary);
    transition: opacity 0.5s; /* fades out before being removed */
}

.loading-overlay.hidden {
    opacity: 0;
    pointer-events: none; /* let clicks pass through while fading */
}

/* Ring spinner: transparent track with one accented quadrant. */
.loading-spinner {
    width: 40px;
    height: 40px;
    border-radius: 50%;
    border: 3px solid rgba(0,229,160,0.15);
    border-top-color: var(--status-normal);
    animation: spin 0.8s linear infinite;
}

@keyframes spin { to { transform: rotate(360deg); } }

.loading-text {
    margin-top: var(--gap-md);
    color: var(--text-secondary);
    font-size: 12px;
    letter-spacing: 2px;
    text-transform: uppercase;
}
|
| 890 |
+
|
| 891 |
+
/* ── Leaflet Overrides ── */
|
| 892 |
+
.grid-map .leaflet-container {
|
| 893 |
+
background: var(--bg-primary) !important;
|
| 894 |
+
}
|
| 895 |
+
|
| 896 |
+
.leaflet-tooltip-dark {
|
| 897 |
+
background: rgba(10, 14, 26, 0.92) !important;
|
| 898 |
+
border: 1px solid rgba(0, 229, 160, 0.3) !important;
|
| 899 |
+
color: #e0e0e0 !important;
|
| 900 |
+
font-family: 'JetBrains Mono', monospace !important;
|
| 901 |
+
font-size: 11px !important;
|
| 902 |
+
border-radius: 6px !important;
|
| 903 |
+
padding: 6px 10px !important;
|
| 904 |
+
box-shadow: 0 4px 20px rgba(0,0,0,0.6) !important;
|
| 905 |
+
}
|
| 906 |
+
|
| 907 |
+
.leaflet-tooltip-dark::before {
|
| 908 |
+
border-top-color: rgba(10, 14, 26, 0.92) !important;
|
| 909 |
+
}
|
| 910 |
+
|
| 911 |
+
.bus-label-icon, .bus-mw-icon, .zone-badge-leaflet {
|
| 912 |
+
background: none !important;
|
| 913 |
+
border: none !important;
|
| 914 |
+
text-align: center;
|
| 915 |
+
}
|
| 916 |
+
|
| 917 |
+
/* Dark zoom controls: restyle Leaflet's +/- buttons to match the theme.
   !important is required to beat Leaflet's bundled stylesheet. */
.leaflet-control-zoom a {
    background: rgba(15, 22, 40, 0.9) !important;
    border-color: rgba(0, 229, 160, 0.2) !important;
    color: var(--status-normal) !important;
    font-family: 'JetBrains Mono', monospace !important;
}
.leaflet-control-zoom a:hover { background: rgba(0, 229, 160, 0.15) !important; }
|
| 927 |
+
|
| 928 |
+
.leaflet-control-attribution {
|
| 929 |
+
background: rgba(10, 14, 26, 0.6) !important;
|
| 930 |
+
color: #555 !important;
|
| 931 |
+
font-size: 9px !important;
|
| 932 |
+
}
|
| 933 |
+
.leaflet-control-attribution a {
|
| 934 |
+
color: #666 !important;
|
| 935 |
+
}
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_multi_agent.py
ADDED
|
@@ -0,0 +1,345 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for multi-agent POMDP features:
|
| 3 |
+
- Zone assignment and partitioning
|
| 4 |
+
- Partial observability (ZoneObservation)
|
| 5 |
+
- Safety layer (action validation and correction)
|
| 6 |
+
- Oversight agent (coordination monitoring)
|
| 7 |
+
- Multi-agent step (combined pipeline)
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import copy
|
| 11 |
+
import unittest
|
| 12 |
+
|
| 13 |
+
import networkx as nx
|
| 14 |
+
import numpy as np
|
| 15 |
+
|
| 16 |
+
from src.environment import OpenGridEnv
|
| 17 |
+
from src.tasks import TASKS
|
| 18 |
+
from src.models import GridAction, BusAdjustment, TopologyAction, ZoneObservation
|
| 19 |
+
from src.safety import SafetyLayer
|
| 20 |
+
from src.oversight import OversightAgent
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def task(task_id: str):
    """Return an isolated deep copy of the named task config.

    Tests mutate config dicts in place, so each test gets its own copy
    to keep mutations from leaking through the shared ``TASKS`` registry.
    """
    source_config = TASKS[task_id]
    return copy.deepcopy(source_config)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class TestZoneAssignment(unittest.TestCase):
    """Tests for multi-agent zone partitioning."""

    def test_all_buses_assigned(self):
        """Every bus should be assigned to exactly one zone."""
        for task_id, cfg in TASKS.items():
            assignments = cfg['zone_assignments']
            for bus_id in range(cfg['num_buses']):
                self.assertIn(bus_id, assignments,
                              f"Bus {bus_id} not assigned in {task_id}")

    def test_zone_count_matches(self):
        """Number of zones should match num_agents."""
        for task_id, cfg in TASKS.items():
            distinct_agents = set(cfg['zone_assignments'].values())
            self.assertEqual(len(distinct_agents), cfg['num_agents'],
                             f"Zone count mismatch in {task_id}")

    def test_no_empty_zones(self):
        """Each zone should have at least 1 bus."""
        for task_id, cfg in TASKS.items():
            for aid in range(cfg['num_agents']):
                self.assertGreater(len(cfg['zone_bus_ids'][aid]), 0,
                                   f"Empty zone {aid} in {task_id}")

    def test_lines_classified(self):
        """All lines should be classified as internal or boundary."""
        for task_id, cfg in TASKS.items():
            # Union of every agent's internal and boundary line IDs.
            classified = set()
            for aid in range(cfg['num_agents']):
                classified.update(cfg['internal_lines'].get(aid, []))
                classified.update(cfg['boundary_lines'].get(aid, []))

            declared = {line['id'] for line in cfg['lines']}
            self.assertEqual(declared, classified,
                             f"Unclassified lines in {task_id}")
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class TestPartialObservability(unittest.TestCase):
    """Tests for POMDP zone observations."""

    def test_partial_obs_returns_zone_obs(self):
        """reset_multi should return ZoneObservation for each agent."""
        cfg = task("task_easy")
        observations = OpenGridEnv(cfg).reset_multi()

        self.assertEqual(len(observations), cfg["num_agents"],
                         "Should have one observation per agent")
        for aid, zone_view in observations.items():
            self.assertIsInstance(zone_view, ZoneObservation)
            self.assertEqual(zone_view.agent_id, aid)

    def test_partial_obs_only_shows_local_buses(self):
        """Each agent should only see buses in their zone."""
        cfg = task("task_medium")
        observations = OpenGridEnv(cfg).reset_multi()

        for aid, zone_view in observations.items():
            visible = {bus.id for bus in zone_view.local_buses}
            self.assertEqual(visible, set(cfg['zone_bus_ids'][aid]),
                             f"Agent {aid} sees wrong buses")

    def test_frequency_has_noise(self):
        """POMDP observations should have noisy frequency readings."""
        cfg = task("task_easy")
        env = OpenGridEnv(cfg)
        env.reset_multi()

        # Compare each zone's (noisy) reading against the true full-state
        # frequency taken from the same reset.
        truth = env.state()
        deviations = [
            abs(env._get_zone_obs(aid).grid_frequency - truth.grid_frequency)
            for aid in range(cfg['num_agents'])
        ]

        # At least one agent should see noisy frequency
        self.assertTrue(any(d > 0.001 for d in deviations),
                        "No frequency noise detected in POMDP observations")
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class TestSafetyLayer(unittest.TestCase):
    """Tests for the safety constraint filter."""

    def setUp(self):
        self.config = task("task_medium")
        self.safety = SafetyLayer(self.config)
        self.env = OpenGridEnv(self.config)
        self.env.reset()

    def _run_safety(self, agent_id, action):
        """Push one proposed action through the safety layer with current env state."""
        return self.safety.validate_and_correct(
            agent_id=agent_id,
            proposed_action=action,
            current_line_state=self.env.line_state,
            current_bus_state=self.env.bus_state,
            cooldowns=self.env.cooldowns,
        )

    def test_zone_boundary_enforcement(self):
        """Agent should not be able to adjust buses in another zone."""
        zone_0 = set(self.config['zone_bus_ids'][0])
        foreign_bus = next(
            (b['id'] for b in self.config['buses'] if b['id'] not in zone_0),
            None,
        )
        if foreign_bus is None:
            self.skipTest("All buses in agent 0's zone (trivial grid)")

        corrected, report = self._run_safety(0, GridAction(bus_adjustments=[
            BusAdjustment(bus_id=foreign_bus, delta=10.0)
        ]))

        self.assertTrue(report.was_corrected, "Should have corrected cross-zone action")
        self.assertEqual(len(corrected.bus_adjustments), 0,
                         "Cross-zone adjustment should be removed")

    def test_safe_action_passes_through(self):
        """A valid action within the agent's zone should not be corrected."""
        zone_0 = set(self.config['zone_bus_ids'][0])
        controllable = next(
            (b['id'] for b in self.config['buses']
             if b['id'] in zone_0 and b['type'] in ('generator', 'battery', 'slack')),
            None,
        )
        if controllable is None:
            self.skipTest("No controllable bus in agent 0's zone")

        corrected, _report = self._run_safety(0, GridAction(bus_adjustments=[
            BusAdjustment(bus_id=controllable, delta=5.0)
        ]))

        # Should pass through (may have minor clamping)
        self.assertEqual(len(corrected.bus_adjustments), 1,
                         "Valid action should produce one adjustment")

    def test_islanding_blocked(self):
        """Opening a bridge line should be blocked by safety layer."""
        graph = nx.Graph()
        graph.add_edges_from((ln['from'], ln['to']) for ln in self.config['lines'])
        bridges = list(nx.bridges(graph))
        if not bridges:
            self.skipTest("No bridges in grid topology")

        # Match the bridge's endpoints against the line list in either direction.
        endpoints = bridges[0]
        line_id = next(
            ln['id'] for ln in self.config['lines']
            if (ln['from'], ln['to']) in (endpoints, endpoints[::-1])
        )

        corrected, report = self._run_safety(0, GridAction(topology_actions=[
            TopologyAction(line_id=line_id, action="open")
        ]))

        self.assertTrue(report.was_corrected, "Bridge opening should be blocked")
        self.assertEqual(len(corrected.topology_actions), 0,
                         "Bridge opening should be removed")

    def test_duplicate_battery_adjustments_aggregated(self):
        """Multiple adjustments to the same battery should be aggregated."""
        battery = next(
            (b for b in self.config['buses'] if b['type'] == 'battery'), None
        )
        if battery is None:
            self.skipTest("No battery in task")

        bus_id = battery['id']
        owner = self.config['zone_assignments'].get(bus_id, 0)

        # Pin the SOC so the expected discharge cap is known.
        for bus in self.env.bus_state:
            if bus['id'] == bus_id:
                bus['soc'] = 10.0

        corrected, _report = self._run_safety(owner, GridAction(bus_adjustments=[
            BusAdjustment(bus_id=bus_id, delta=8.0),
            BusAdjustment(bus_id=bus_id, delta=8.0),
        ]))

        combined = sum(adj.delta for adj in corrected.bus_adjustments)
        self.assertLessEqual(combined, 10.0,
                             "Combined discharge should not exceed SOC")
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
class TestOversightAgent(unittest.TestCase):
    """Tests for the coordination oversight agent."""

    def test_no_conflict_scores_high(self):
        """Cooperative actions should score high coordination."""
        cfg = task("task_easy")
        monitor = OversightAgent(cfg)

        # Both agents inject (cooperative)
        proposals = {
            0: GridAction(bus_adjustments=[BusAdjustment(bus_id=0, delta=5.0)]),
            1: GridAction(bus_adjustments=[BusAdjustment(bus_id=1, delta=3.0)]),
        }

        verdict = monitor.evaluate(
            agent_actions=proposals,
            safety_reports={},
            pre_frequency=49.8,
            post_frequency=49.9,
            pre_bus_state=[],
            post_bus_state=[],
        )

        self.assertGreater(verdict.coordination_score, 0.5,
                           "Cooperative actions should score > 0.5")

    def test_reset_clears_history(self):
        """Resetting oversight should clear intervention history."""
        monitor = OversightAgent(task("task_easy"))
        monitor.intervention_history[0] = 5
        monitor.reset()
        self.assertEqual(monitor.intervention_history[0], 0)
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
class TestMultiAgentStep(unittest.TestCase):
    """Integration tests for the full multi-agent pipeline."""

    @staticmethod
    def _noop_actions(cfg):
        """Build an empty GridAction for every agent in the task."""
        return {aid: GridAction() for aid in range(cfg['num_agents'])}

    def test_multi_agent_step_returns_result(self):
        """step_multi should return a complete MultiAgentStepResult."""
        cfg = task("task_easy")
        env = OpenGridEnv(cfg)
        env.reset_multi()

        outcome = env.step_multi(self._noop_actions(cfg))

        agent_count = cfg['num_agents']
        self.assertEqual(len(outcome.observations), agent_count)
        self.assertEqual(len(outcome.rewards), agent_count)
        self.assertIsInstance(outcome.team_reward, float)
        self.assertIsInstance(outcome.done, bool)
        self.assertEqual(len(outcome.safety_reports), agent_count)

    def test_safety_reports_match_agent_ids(self):
        """Safety reports should contain all expected agent IDs."""
        cfg = task("task_easy")
        env = OpenGridEnv(cfg)
        env.reset_multi()

        outcome = env.step_multi(self._noop_actions(cfg))

        self.assertEqual(set(outcome.safety_reports.keys()),
                         set(range(cfg['num_agents'])),
                         "Safety report agent IDs should match expected agents")

    def test_multi_agent_episode_completes(self):
        """A full multi-agent episode should complete without errors."""
        cfg = task("task_easy")
        env = OpenGridEnv(cfg)
        env.reset_multi()

        finished = False
        taken = 0
        # Hard-cap a few steps past max_steps so a broken env can't loop forever.
        while not finished and taken < cfg['max_steps'] + 5:
            finished = env.step_multi(self._noop_actions(cfg)).done
            taken += 1

        self.assertTrue(finished, "Episode should terminate")
        self.assertLessEqual(taken, cfg['max_steps'] + 1)

    def test_backward_compatibility(self):
        """Single-agent reset/step should still work after multi-agent changes."""
        for task_id in TASKS:
            env = OpenGridEnv(task(task_id))
            first_obs = env.reset()
            self.assertGreater(len(first_obs.buses), 0,
                               f"No buses in {task_id}")

            next_obs, reward, _done, _info = env.step(GridAction())
            self.assertEqual(next_obs.timestep, 1)
            self.assertIsInstance(reward.value, float)
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
# Allow running this test module directly: `python tests/test_multi_agent.py`.
if __name__ == '__main__':
    unittest.main()
|
tests/test_solver.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for core simulation components:
|
| 3 |
+
- DC power flow solver
|
| 4 |
+
- Environment lifecycle (reset, step, terminate)
|
| 5 |
+
- Grading system (scoring, bounds, reproducibility)
|
| 6 |
+
- Baseline heuristic policy
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import copy
|
| 10 |
+
import unittest
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
|
| 14 |
+
from src.physics import DCSolver, IslandedException
|
| 15 |
+
from src.environment import OpenGridEnv
|
| 16 |
+
from src.tasks import TASKS
|
| 17 |
+
from src.models import GridAction, BusAdjustment
|
| 18 |
+
from src.grader import RobustnessGrader, compute_analytical_ceiling
|
| 19 |
+
from src.baseline import heuristic_policy
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def task(task_id: str):
    """Return a deep copy of the registered config for *task_id*.

    Copying ensures no test can mutate the shared TASKS registry and
    contaminate later tests.
    """
    config = TASKS[task_id]
    return copy.deepcopy(config)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class TestDCSolver(unittest.TestCase):
    """Unit tests for the DC power-flow solver on a 3-bus triangle grid."""

    def setUp(self):
        """Build a fully connected 3-bus network and load it into a solver."""
        self.num_buses = 3
        self.lines = [
            {'id': 'L01', 'from': 0, 'to': 1, 'susceptance': 100, 'connected': True},
            {'id': 'L12', 'from': 1, 'to': 2, 'susceptance': 50, 'connected': True},
            {'id': 'L02', 'from': 0, 'to': 2, 'susceptance': 100, 'connected': True},
        ]
        self.solver = DCSolver(self.num_buses)
        self.solver.update_grid(self.lines)

    def test_power_flow_balance(self):
        """Slack bus should absorb any generation/load imbalance."""
        injections = np.array([0.0, 50.0, -100.0])
        theta, flows, slack_inj = self.solver.solve(injections)

        # Flows must be reported for the lines we defined.
        self.assertIn('L01', flows)
        self.assertIn('L02', flows)

    def test_islanding_detection(self):
        """Disconnecting lines to island bus 2 should raise IslandedException."""
        islanded_topology = [
            {'id': 'L01', 'from': 0, 'to': 1, 'susceptance': 100, 'connected': True},
            {'id': 'L12', 'from': 1, 'to': 2, 'susceptance': 50, 'connected': False},
            {'id': 'L02', 'from': 0, 'to': 2, 'susceptance': 100, 'connected': False},
        ]
        with self.assertRaises(IslandedException):
            self.solver.update_grid(islanded_topology)

    def test_slack_injection_returned(self):
        """solve() should return slack bus injection as third element."""
        outcome = self.solver.solve(np.array([0.0, 50.0, -100.0]))
        self.assertEqual(len(outcome), 3)
        _, _, slack_inj = outcome
        # The slack bus covers the 50 MW net deficit of the other buses.
        self.assertAlmostEqual(slack_inj, 50.0, places=0)

    def test_solve_before_update_raises(self):
        """Calling solve() on a fresh solver should raise RuntimeError."""
        uninitialised = DCSolver(3)
        with self.assertRaises(RuntimeError):
            uninitialised.solve(np.array([0.0, 10.0, -10.0]))

    def test_invalid_bus_index_raises(self):
        """Lines referencing out-of-range bus IDs should raise ValueError."""
        solver = DCSolver(3)
        with self.assertRaises(ValueError):
            solver.update_grid([
                {'id': 'L_bad', 'from': 0, 'to': 99, 'susceptance': 50, 'connected': True},
            ])
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class TestEnvironment(unittest.TestCase):
    """Lifecycle tests for the single-agent OpenGrid environment."""

    def test_reset_returns_observation(self):
        """reset() should return a valid GridObservation."""
        obs = OpenGridEnv(task("task_easy")).reset()
        self.assertEqual(obs.timestep, 0)
        self.assertGreater(len(obs.buses), 0, "Observation should have buses")
        self.assertGreater(len(obs.lines), 0, "Observation should have lines")

    def test_step_returns_tuple(self):
        """step() should return (obs, reward, done, info)."""
        env = OpenGridEnv(task("task_easy"))
        env.reset()
        obs, reward, done, info = env.step(GridAction())
        self.assertEqual(obs.timestep, 1)
        self.assertIsInstance(reward.value, float)
        self.assertIsInstance(done, bool)

    def test_reproducibility(self):
        """Running the same task twice should produce identical initial observations."""
        first = OpenGridEnv(task("task_easy")).reset()
        second = OpenGridEnv(task("task_easy")).reset()

        self.assertEqual(first.grid_frequency, second.grid_frequency)
        self.assertEqual(len(first.buses), len(second.buses))

    def test_episode_terminates(self):
        """Episode should end after max_steps."""
        config = task("task_easy")
        env = OpenGridEnv(config)
        env.reset()
        finished = False
        count = 0
        # Hard cap of 100 so a non-terminating env still ends the test.
        while not finished and count < 100:
            _, _, finished, _ = env.step(GridAction())
            count += 1
        self.assertTrue(finished, "Episode should terminate")
        self.assertLessEqual(count, config["max_steps"])

    def test_frequency_reasonable(self):
        """Frequency should stay in a reasonable range for do-nothing agent."""
        env = OpenGridEnv(task("task_easy"))
        obs = env.reset()
        for _ in range(10):
            obs, _, done, _ = env.step(GridAction())
            if done:
                break
        self.assertGreater(obs.grid_frequency, 40.0,
                           "Frequency below reasonable minimum")
        self.assertLess(obs.grid_frequency, 60.0,
                        "Frequency above reasonable maximum")
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
class TestGrader(unittest.TestCase):
    """Scoring-bound checks for the robustness grader."""

    def test_grader_score_range(self):
        """Grader should return score strictly in (0, 1) — never 0.0 or 1.0."""
        outcome = RobustnessGrader(task("task_easy")).evaluate_policy(
            heuristic_policy, n_episodes=1)
        self.assertGreater(outcome["score"], 0.0)
        self.assertLess(outcome["score"], 1.0)

    def test_grader_all_tasks(self):
        """Grader should work on all registered tasks."""
        for task_id, config in TASKS.items():
            # Deep-copy so the shared registry entry is never mutated.
            outcome = RobustnessGrader(copy.deepcopy(config)).evaluate_policy(
                heuristic_policy, n_episodes=1)
            self.assertIn("score", outcome, f"Missing 'score' for {task_id}")
            self.assertIn("avg_raw_reward", outcome,
                          f"Missing 'avg_raw_reward' for {task_id}")
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
class TestBaseline(unittest.TestCase):
    """Sanity checks for the heuristic baseline policy."""

    def test_heuristic_returns_valid_action(self):
        """Heuristic policy should return a valid GridAction."""
        initial_obs = OpenGridEnv(task("task_easy")).reset()
        self.assertIsInstance(heuristic_policy(initial_obs), GridAction)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
class TestReproducibility(unittest.TestCase):
    """Determinism and analytical-bound checks for the grading pipeline."""

    def test_floor_deterministic(self):
        """Two calls to _estimate_bounds should produce identical floors (seeded RNG)."""
        first = RobustnessGrader(task("task_easy"))
        first._estimate_bounds(n_samples=3)

        second = RobustnessGrader(task("task_easy"))
        second._estimate_bounds(n_samples=3)

        self.assertEqual(first.reward_floor, second.reward_floor,
                         "Floor should be deterministic with same seed")

    def test_ceiling_is_analytical(self):
        """Ceiling should be max_steps * 1.2, not an empirical estimate."""
        config = task("task_easy")
        bounds = RobustnessGrader(config).get_bounds()
        self.assertEqual(bounds["reward_ceiling"],
                         compute_analytical_ceiling(config["max_steps"]),
                         "Ceiling should match analytical formula")

    def test_heuristic_score_below_one(self):
        """With analytical ceiling, heuristic should score < 1.0 (not degenerate)."""
        outcome = RobustnessGrader(task("task_easy")).evaluate_policy(
            heuristic_policy, n_episodes=1)
        self.assertLess(outcome["score"], 1.0)
        self.assertGreater(outcome["score"], 0.0)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
# Allow running this test module directly: `python tests/test_solver.py`.
if __name__ == '__main__':
    unittest.main()
|
training/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Training module for OpenGrid GRPO pipeline
|
training/opengrid_grpo_colab.ipynb
ADDED
|
@@ -0,0 +1,635 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# 🔋 OpenGrid — GRPO Training Notebook\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"**Multi-Agent RL for Power Grid Operations**\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"This notebook trains an LLM (Qwen 2.5 1.5B) to operate a power grid using GRPO (Group Relative Policy Optimization).\n",
|
| 12 |
+
"\n",
|
| 13 |
+
"- **Environment**: OpenGrid — multi-agent POMDP with safety layer & oversight agent\n",
|
| 14 |
+
"- **Task**: Maintain 50 Hz frequency, prevent line overloads, avoid blackouts\n",
|
| 15 |
+
"- **Training**: TRL GRPOTrainer + Unsloth 4-bit quantization\n",
|
| 16 |
+
"\n",
|
| 17 |
+
"⚡ **Runtime**: Select `T4 GPU` from Runtime → Change runtime type"
|
| 18 |
+
]
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"cell_type": "markdown",
|
| 22 |
+
"metadata": {},
|
| 23 |
+
"source": [
|
| 24 |
+
"## 1. Install Dependencies"
|
| 25 |
+
]
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"cell_type": "code",
|
| 29 |
+
"execution_count": null,
|
| 30 |
+
"metadata": {},
|
| 31 |
+
"outputs": [],
|
| 32 |
+
"source": [
|
| 33 |
+
"%%capture\n",
|
| 34 |
+
"!pip install unsloth\n",
|
| 35 |
+
"!pip install --no-deps trl peft accelerate bitsandbytes\n",
|
| 36 |
+
"!pip install fastapi uvicorn pydantic numpy networkx matplotlib openai httpx datasets"
|
| 37 |
+
]
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"cell_type": "markdown",
|
| 41 |
+
"metadata": {},
|
| 42 |
+
"source": [
|
| 43 |
+
"## 2. Clone OpenGrid Repository"
|
| 44 |
+
]
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"cell_type": "code",
|
| 48 |
+
"execution_count": null,
|
| 49 |
+
"metadata": {},
|
| 50 |
+
"outputs": [],
|
| 51 |
+
"source": [
|
| 52 |
+
"import os\n",
|
| 53 |
+
"\n",
|
| 54 |
+
"# ⚠️ UPDATE THIS with your actual repo URL\n",
|
| 55 |
+
"REPO_URL = \"https://github.com/krishnagoyal099/Opengrid_env.git\"\n",
|
| 56 |
+
"\n",
|
| 57 |
+
"if not os.path.exists(\"opengrid\"):\n",
|
| 58 |
+
" !git clone {REPO_URL} opengrid\n",
|
| 59 |
+
"else:\n",
|
| 60 |
+
" !cd opengrid && git pull\n",
|
| 61 |
+
"\n",
|
| 62 |
+
"os.chdir(\"opengrid\")\n",
|
| 63 |
+
"print(f\"Working directory: {os.getcwd()}\")\n",
|
| 64 |
+
"!ls -la"
|
| 65 |
+
]
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"cell_type": "markdown",
|
| 69 |
+
"metadata": {},
|
| 70 |
+
"source": [
|
| 71 |
+
"## 3. Verify GPU & Environment"
|
| 72 |
+
]
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"cell_type": "code",
|
| 76 |
+
"execution_count": null,
|
| 77 |
+
"metadata": {},
|
| 78 |
+
"outputs": [],
|
| 79 |
+
"source": [
|
| 80 |
+
"import torch\n",
|
| 81 |
+
"print(f\"PyTorch: {torch.__version__}\")\n",
|
| 82 |
+
"print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
|
| 83 |
+
"if torch.cuda.is_available():\n",
|
| 84 |
+
" print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
|
| 85 |
+
" print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")\n",
|
| 86 |
+
"else:\n",
|
| 87 |
+
" print(\"⚠️ No GPU detected! Go to Runtime → Change runtime type → T4 GPU\")"
|
| 88 |
+
]
|
| 89 |
+
},
|
| 90 |
+
{
|
| 91 |
+
"cell_type": "code",
|
| 92 |
+
"execution_count": null,
|
| 93 |
+
"metadata": {},
|
| 94 |
+
"outputs": [],
|
| 95 |
+
"source": [
|
| 96 |
+
"# Verify OpenGrid imports work\n",
|
| 97 |
+
"import sys\n",
|
| 98 |
+
"sys.path.insert(0, '.')\n",
|
| 99 |
+
"\n",
|
| 100 |
+
"from src.environment import OpenGridEnv\n",
|
| 101 |
+
"from src.tasks import TASKS\n",
|
| 102 |
+
"from src.models import GridAction, BusAdjustment\n",
|
| 103 |
+
"\n",
|
| 104 |
+
"print(f\"Available tasks: {list(TASKS.keys())}\")\n",
|
| 105 |
+
"for tid, cfg in TASKS.items():\n",
|
| 106 |
+
" print(f\" {tid}: {cfg['num_buses']} buses, {cfg['num_agents']} agents, {cfg.get('difficulty','')}\")"
|
| 107 |
+
]
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"cell_type": "markdown",
|
| 111 |
+
"metadata": {},
|
| 112 |
+
"source": [
|
| 113 |
+
"## 4. Run Test Mode (Pipeline Verification)"
|
| 114 |
+
]
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"cell_type": "code",
|
| 118 |
+
"execution_count": null,
|
| 119 |
+
"metadata": {},
|
| 120 |
+
"outputs": [],
|
| 121 |
+
"source": [
|
| 122 |
+
"!python training/train_grpo.py --test-mode"
|
| 123 |
+
]
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"cell_type": "markdown",
|
| 127 |
+
"metadata": {},
|
| 128 |
+
"source": [
|
| 129 |
+
"## 5. Baseline Evaluation (Before Training)\n",
|
| 130 |
+
"\n",
|
| 131 |
+
"Run the heuristic policy to get baseline scores. We'll compare against this after training."
|
| 132 |
+
]
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"cell_type": "code",
|
| 136 |
+
"execution_count": null,
|
| 137 |
+
"metadata": {},
|
| 138 |
+
"outputs": [],
|
| 139 |
+
"source": [
|
| 140 |
+
"import json\n",
|
| 141 |
+
"import re\n",
|
| 142 |
+
"import numpy as np\n",
|
| 143 |
+
"from src.environment import OpenGridEnv\n",
|
| 144 |
+
"from src.tasks import TASKS\n",
|
| 145 |
+
"from src.models import GridAction, BusAdjustment\n",
|
| 146 |
+
"from training.train_grpo import (\n",
|
| 147 |
+
" rollout_multi_agent, format_observation_prompt, extract_action\n",
|
| 148 |
+
")\n",
|
| 149 |
+
"\n",
|
| 150 |
+
"def heuristic_generate(prompt):\n",
|
| 151 |
+
" \"\"\"Simple proportional controller as baseline.\"\"\"\n",
|
| 152 |
+
" freq_match = re.search(r'Frequency: ([\\d.]+)', prompt)\n",
|
| 153 |
+
" freq = float(freq_match.group(1)) if freq_match else 50.0\n",
|
| 154 |
+
" error = 50.0 - freq\n",
|
| 155 |
+
" delta = max(-20, min(20, error * 10))\n",
|
| 156 |
+
" bus_match = re.search(r'Bus (\\d+) \\((generator|battery|slack)\\)', prompt)\n",
|
| 157 |
+
" if bus_match:\n",
|
| 158 |
+
" return json.dumps({\"bus_adjustments\": [{\"bus_id\": int(bus_match.group(1)), \"delta\": round(delta, 1)}], \"topology_actions\": []})\n",
|
| 159 |
+
" return json.dumps({\"bus_adjustments\": [], \"topology_actions\": []})\n",
|
| 160 |
+
"\n",
|
| 161 |
+
"# Evaluate baseline on all tasks\n",
|
| 162 |
+
"baseline_results = {}\n",
|
| 163 |
+
"for task_id in [\"task_easy\", \"task_medium\", \"task_karnataka\"]:\n",
|
| 164 |
+
" if task_id not in TASKS:\n",
|
| 165 |
+
" continue\n",
|
| 166 |
+
" config = TASKS[task_id]\n",
|
| 167 |
+
" rewards = []\n",
|
| 168 |
+
" import copy\n",
|
| 169 |
+
" for ep in range(5):\n",
|
| 170 |
+
" ep_config = copy.deepcopy(config)\n",
|
| 171 |
+
" ep_config['seed'] = 42 + ep\n",
|
| 172 |
+
" env = OpenGridEnv(ep_config)\n",
|
| 173 |
+
" result = rollout_multi_agent(env, heuristic_generate, ep_config)\n",
|
| 174 |
+
" rewards.append(result['total_reward'])\n",
|
| 175 |
+
" baseline_results[task_id] = {\n",
|
| 176 |
+
" \"avg_reward\": np.mean(rewards),\n",
|
| 177 |
+
" \"std_reward\": np.std(rewards),\n",
|
| 178 |
+
" \"rewards\": rewards\n",
|
| 179 |
+
" }\n",
|
| 180 |
+
" print(f\"[BASELINE] {task_id}: {np.mean(rewards):.2f} ± {np.std(rewards):.2f}\")\n",
|
| 181 |
+
"\n",
|
| 182 |
+
"# Save baseline for later comparison\n",
|
| 183 |
+
"import pickle\n",
|
| 184 |
+
"os.makedirs(\"training/outputs\", exist_ok=True)\n",
|
| 185 |
+
"with open(\"training/outputs/baseline_results.pkl\", \"wb\") as f:\n",
|
| 186 |
+
" pickle.dump(baseline_results, f)\n",
|
| 187 |
+
"print(\"\\n✅ Baseline scores saved.\")"
|
| 188 |
+
]
|
| 189 |
+
},
|
| 190 |
+
{
|
| 191 |
+
"cell_type": "markdown",
|
| 192 |
+
"metadata": {},
|
| 193 |
+
"source": [
|
| 194 |
+
"## 6. Load Model with Unsloth (4-bit Quantized)"
|
| 195 |
+
]
|
| 196 |
+
},
|
| 197 |
+
{
|
| 198 |
+
"cell_type": "code",
|
| 199 |
+
"execution_count": null,
|
| 200 |
+
"metadata": {},
|
| 201 |
+
"outputs": [],
|
| 202 |
+
"source": [
|
| 203 |
+
"from unsloth import FastLanguageModel\n",
|
| 204 |
+
"\n",
|
| 205 |
+
"MODEL_NAME = \"unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit\"\n",
|
| 206 |
+
"\n",
|
| 207 |
+
"model, tokenizer = FastLanguageModel.from_pretrained(\n",
|
| 208 |
+
" model_name=MODEL_NAME,\n",
|
| 209 |
+
" max_seq_length=2048,\n",
|
| 210 |
+
" load_in_4bit=True,\n",
|
| 211 |
+
")\n",
|
| 212 |
+
"\n",
|
| 213 |
+
"model = FastLanguageModel.get_peft_model(\n",
|
| 214 |
+
" model,\n",
|
| 215 |
+
" r=16,\n",
|
| 216 |
+
" lora_alpha=16,\n",
|
| 217 |
+
" lora_dropout=0,\n",
|
| 218 |
+
" target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
|
| 219 |
+
" \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
|
| 220 |
+
")\n",
|
| 221 |
+
"\n",
|
| 222 |
+
"if tokenizer.pad_token is None:\n",
|
| 223 |
+
" tokenizer.pad_token = tokenizer.eos_token\n",
|
| 224 |
+
"\n",
|
| 225 |
+
"print(f\"✅ Model loaded: {MODEL_NAME}\")\n",
|
| 226 |
+
"print(f\" Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}\")"
|
| 227 |
+
]
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"cell_type": "markdown",
|
| 231 |
+
"metadata": {},
|
| 232 |
+
"source": [
|
| 233 |
+
"## 7. Generate Training Prompts from Environment"
|
| 234 |
+
]
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"cell_type": "code",
|
| 238 |
+
"execution_count": null,
|
| 239 |
+
"metadata": {},
|
| 240 |
+
"outputs": [],
|
| 241 |
+
"source": [
|
| 242 |
+
"import copy\n",
|
| 243 |
+
"import json as _json\n",
|
| 244 |
+
"import numpy as np\n",
|
| 245 |
+
"from training.train_grpo import SYSTEM_PROMPT, format_observation_prompt\n",
|
| 246 |
+
"\n",
|
| 247 |
+
"TRAIN_TASK = \"task_karnataka\" # Change to task_easy for faster first run\n",
|
| 248 |
+
"NUM_EPISODES = 30\n",
|
| 249 |
+
"\n",
|
| 250 |
+
"task_config = TASKS[TRAIN_TASK]\n",
|
| 251 |
+
"base_seed = task_config.get('seed', 42)\n",
|
| 252 |
+
"\n",
|
| 253 |
+
"prompts = []\n",
|
| 254 |
+
"obs_contexts = [] # stored as JSON strings to satisfy PyArrow schema inference\n",
|
| 255 |
+
"\n",
|
| 256 |
+
"for episode in range(NUM_EPISODES):\n",
|
| 257 |
+
" ep_config = copy.deepcopy(task_config)\n",
|
| 258 |
+
" ep_config['seed'] = base_seed + episode\n",
|
| 259 |
+
" env = OpenGridEnv(ep_config)\n",
|
| 260 |
+
" zone_obs = env.reset_multi()\n",
|
| 261 |
+
"\n",
|
| 262 |
+
" for t in range(min(10, task_config['max_steps'])):\n",
|
| 263 |
+
" for agent_id, obs in zone_obs.items():\n",
|
| 264 |
+
" # model_dump_json() → json.loads() ensures all keys are strings\n",
|
| 265 |
+
" obs_dict = _json.loads(obs.model_dump_json())\n",
|
| 266 |
+
" prompt_text = format_observation_prompt(obs_dict, zone_name=obs.zone_name)\n",
|
| 267 |
+
" messages = [\n",
|
| 268 |
+
" {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
|
| 269 |
+
" {\"role\": \"user\", \"content\": prompt_text},\n",
|
| 270 |
+
" ]\n",
|
| 271 |
+
" formatted = tokenizer.apply_chat_template(\n",
|
| 272 |
+
" messages, tokenize=False, add_generation_prompt=True\n",
|
| 273 |
+
" )\n",
|
| 274 |
+
" prompts.append(formatted)\n",
|
| 275 |
+
" # Store as JSON string — flat scalar, no schema-inference issues\n",
|
| 276 |
+
" obs_contexts.append(_json.dumps(obs_dict))\n",
|
| 277 |
+
"\n",
|
| 278 |
+
" # Advance env with diverse random actions (no slack bus)\n",
|
| 279 |
+
" random_actions = {}\n",
|
| 280 |
+
" for aid in range(env.num_agents):\n",
|
| 281 |
+
" zone_buses = task_config['zone_bus_ids'].get(aid, [])\n",
|
| 282 |
+
" controllable = [bid for bid in zone_buses\n",
|
| 283 |
+
" if next((b for b in task_config['buses'] if b['id'] == bid), {}).get('type')\n",
|
| 284 |
+
" in ['generator', 'battery']]\n",
|
| 285 |
+
" adj = []\n",
|
| 286 |
+
" if controllable:\n",
|
| 287 |
+
" bid = np.random.choice(controllable)\n",
|
| 288 |
+
" adj = [BusAdjustment(bus_id=int(bid), delta=float(np.random.uniform(-15, 15)))]\n",
|
| 289 |
+
" random_actions[aid] = GridAction(bus_adjustments=adj)\n",
|
| 290 |
+
"\n",
|
| 291 |
+
" result = env.step_multi(random_actions)\n",
|
| 292 |
+
" if result.done:\n",
|
| 293 |
+
" break\n",
|
| 294 |
+
" zone_obs = result.observations\n",
|
| 295 |
+
"\n",
|
| 296 |
+
"print(f\"✅ Generated {len(prompts)} training prompts\")\n",
|
| 297 |
+
"print(f\"\\nSample prompt (first 400 chars):\")\n",
|
| 298 |
+
"print(prompts[0][:400])"
|
| 299 |
+
]
|
| 300 |
+
},
|
| 301 |
+
{
|
| 302 |
+
"cell_type": "markdown",
|
| 303 |
+
"metadata": {},
|
| 304 |
+
"source": [
|
| 305 |
+
"## 8. Define GRPO Reward Function"
|
| 306 |
+
]
|
| 307 |
+
},
|
| 308 |
+
{
|
| 309 |
+
"cell_type": "code",
|
| 310 |
+
"execution_count": null,
|
| 311 |
+
"metadata": {},
|
| 312 |
+
"outputs": [],
|
| 313 |
+
"source": [
|
| 314 |
+
"import json as _json\n",
|
| 315 |
+
"from training.train_grpo import compute_grpo_reward, extract_action\n",
|
| 316 |
+
"\n",
|
| 317 |
+
"def reward_fn(completions, obs_context=None, **kwargs):\n",
|
| 318 |
+
" \"\"\"GRPO-compatible reward function for OpenGrid.\n",
|
| 319 |
+
" obs_context arrives as JSON strings from the dataset column.\n",
|
| 320 |
+
" \"\"\"\n",
|
| 321 |
+
" texts = []\n",
|
| 322 |
+
" for c in completions:\n",
|
| 323 |
+
" if isinstance(c, list):\n",
|
| 324 |
+
" text = c[-1]['content'] if c else \"\"\n",
|
| 325 |
+
" else:\n",
|
| 326 |
+
" text = str(c)\n",
|
| 327 |
+
" texts.append(text)\n",
|
| 328 |
+
"\n",
|
| 329 |
+
" # Deserialize JSON strings → dicts for the reward scorer\n",
|
| 330 |
+
" if obs_context is None:\n",
|
| 331 |
+
" batch_obs = [None] * len(texts)\n",
|
| 332 |
+
" else:\n",
|
| 333 |
+
" batch_obs = [\n",
|
| 334 |
+
" _json.loads(ctx) if isinstance(ctx, str) else ctx\n",
|
| 335 |
+
" for ctx in obs_context\n",
|
| 336 |
+
" ]\n",
|
| 337 |
+
" return compute_grpo_reward(texts, batch_obs)\n",
|
| 338 |
+
"\n",
|
| 339 |
+
"# Quick sanity test\n",
|
| 340 |
+
"test_rewards = reward_fn([\n",
|
| 341 |
+
" '{\"bus_adjustments\": [{\"bus_id\": 1, \"delta\": 5.0}], \"topology_actions\": []}',\n",
|
| 342 |
+
" 'invalid json here',\n",
|
| 343 |
+
"])\n",
|
| 344 |
+
"print(f\"Test rewards: {test_rewards}\")\n",
|
| 345 |
+
"assert len(test_rewards) == 2, \"reward_fn must return one score per completion\"\n",
|
| 346 |
+
"print(\"✅ reward_fn OK\")"
|
| 347 |
+
]
|
| 348 |
+
},
|
| 349 |
+
{
|
| 350 |
+
"cell_type": "markdown",
|
| 351 |
+
"metadata": {},
|
| 352 |
+
"source": [
|
| 353 |
+
"## 9. Train with GRPO 🚀"
|
| 354 |
+
]
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"cell_type": "code",
|
| 358 |
+
"execution_count": null,
|
| 359 |
+
"metadata": {},
|
| 360 |
+
"outputs": [],
|
| 361 |
+
"source": [
|
| 362 |
+
"from trl import GRPOTrainer, GRPOConfig\n",
|
| 363 |
+
"from datasets import Dataset\n",
|
| 364 |
+
"\n",
|
| 365 |
+
"_cuda_ok = torch.cuda.is_available()\n",
|
| 366 |
+
"_bf16 = _cuda_ok and torch.cuda.is_bf16_supported()\n",
|
| 367 |
+
"_fp16 = _cuda_ok and not _bf16\n",
|
| 368 |
+
"\n",
|
| 369 |
+
"grpo_config = GRPOConfig(\n",
|
| 370 |
+
" output_dir=\"training/outputs/grpo_checkpoints\",\n",
|
| 371 |
+
" num_train_epochs=1,\n",
|
| 372 |
+
" per_device_train_batch_size=2,\n",
|
| 373 |
+
" gradient_accumulation_steps=4,\n",
|
| 374 |
+
" learning_rate=5e-6,\n",
|
| 375 |
+
" logging_steps=5,\n",
|
| 376 |
+
" save_steps=50,\n",
|
| 377 |
+
" max_completion_length=256,\n",
|
| 378 |
+
" num_generations=4,\n",
|
| 379 |
+
" report_to=\"none\",\n",
|
| 380 |
+
" remove_unused_columns=False,\n",
|
| 381 |
+
" bf16=_bf16,\n",
|
| 382 |
+
" fp16=_fp16,\n",
|
| 383 |
+
")\n",
|
| 384 |
+
"\n",
|
| 385 |
+
"# obs_contexts are JSON strings — PyArrow handles flat strings with no issues\n",
|
| 386 |
+
"train_dataset = Dataset.from_dict({\"prompt\": prompts, \"obs_context\": obs_contexts})\n",
|
| 387 |
+
"print(f\"Dataset: {len(train_dataset)} rows, columns: {train_dataset.column_names}\")\n",
|
| 388 |
+
"\n",
|
| 389 |
+
"trainer = GRPOTrainer(\n",
|
| 390 |
+
" model=model,\n",
|
| 391 |
+
" args=grpo_config,\n",
|
| 392 |
+
" train_dataset=train_dataset,\n",
|
| 393 |
+
" reward_funcs=reward_fn,\n",
|
| 394 |
+
" processing_class=tokenizer,\n",
|
| 395 |
+
")\n",
|
| 396 |
+
"\n",
|
| 397 |
+
"print(f\"Training on {len(prompts)} prompts, {grpo_config.num_train_epochs} epoch(s)\")\n",
|
| 398 |
+
"print(f\"Effective batch size: {grpo_config.per_device_train_batch_size * grpo_config.gradient_accumulation_steps}\")\n",
|
| 399 |
+
"print(\"\\n🚀 Starting GRPO training...\")\n",
|
| 400 |
+
"\n",
|
| 401 |
+
"train_result = trainer.train()\n",
|
| 402 |
+
"\n",
|
| 403 |
+
"print(\"\\n✅ Training complete!\")\n",
|
| 404 |
+
"print(f\" Total steps: {trainer.state.global_step}\")"
|
| 405 |
+
]
|
| 406 |
+
},
|
| 407 |
+
{
|
| 408 |
+
"cell_type": "markdown",
|
| 409 |
+
"metadata": {},
|
| 410 |
+
"source": [
|
| 411 |
+
"## 10. Save Trained Model"
|
| 412 |
+
]
|
| 413 |
+
},
|
| 414 |
+
{
|
| 415 |
+
"cell_type": "code",
|
| 416 |
+
"execution_count": null,
|
| 417 |
+
"metadata": {},
|
| 418 |
+
"outputs": [],
|
| 419 |
+
"source": [
|
| 420 |
+
"OUTPUT_PATH = \"training/outputs/trained_model\"\n",
|
| 421 |
+
"trainer.save_model(OUTPUT_PATH)\n",
|
| 422 |
+
"tokenizer.save_pretrained(OUTPUT_PATH)\n",
|
| 423 |
+
"print(f\"✅ Model saved to {OUTPUT_PATH}\")"
|
| 424 |
+
]
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"cell_type": "markdown",
|
| 428 |
+
"metadata": {},
|
| 429 |
+
"source": [
|
| 430 |
+
"## 11. Evaluate Trained Model (After Training)"
|
| 431 |
+
]
|
| 432 |
+
},
|
| 433 |
+
{
|
| 434 |
+
"cell_type": "code",
|
| 435 |
+
"execution_count": null,
|
| 436 |
+
"metadata": {},
|
| 437 |
+
"outputs": [],
|
| 438 |
+
"source": [
|
| 439 |
+
"from transformers import pipeline\n",
|
| 440 |
+
"\n",
|
| 441 |
+
"# Create generation function from trained model\n",
|
| 442 |
+
"FastLanguageModel.for_inference(model)\n",
|
| 443 |
+
"\n",
|
| 444 |
+
"def trained_generate(prompt):\n",
|
| 445 |
+
" \"\"\"Generate action using the trained model.\"\"\"\n",
|
| 446 |
+
" messages = [\n",
|
| 447 |
+
" {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
|
| 448 |
+
" {\"role\": \"user\", \"content\": prompt},\n",
|
| 449 |
+
" ]\n",
|
| 450 |
+
" formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
|
| 451 |
+
" inputs = tokenizer(formatted, return_tensors=\"pt\").to(model.device)\n",
|
| 452 |
+
" with torch.no_grad():\n",
|
| 453 |
+
" outputs = model.generate(\n",
|
| 454 |
+
" **inputs,\n",
|
| 455 |
+
" max_new_tokens=256,\n",
|
| 456 |
+
" temperature=0.3,\n",
|
| 457 |
+
" do_sample=True,\n",
|
| 458 |
+
" )\n",
|
| 459 |
+
" response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)\n",
|
| 460 |
+
" return response\n",
|
| 461 |
+
"\n",
|
| 462 |
+
"# Evaluate on same tasks as baseline\n",
|
| 463 |
+
"trained_results = {}\n",
|
| 464 |
+
"for task_id in [\"task_easy\", \"task_medium\", \"task_karnataka\"]:\n",
|
| 465 |
+
" if task_id not in TASKS:\n",
|
| 466 |
+
" continue\n",
|
| 467 |
+
" config = TASKS[task_id]\n",
|
| 468 |
+
" rewards = []\n",
|
| 469 |
+
" import copy\n",
|
| 470 |
+
" for ep in range(5):\n",
|
| 471 |
+
" ep_config = copy.deepcopy(config)\n",
|
| 472 |
+
" ep_config['seed'] = 42 + ep\n",
|
| 473 |
+
" env = OpenGridEnv(ep_config)\n",
|
| 474 |
+
" result = rollout_multi_agent(env, trained_generate, ep_config)\n",
|
| 475 |
+
" rewards.append(result['total_reward'])\n",
|
| 476 |
+
" print(f\" {task_id} ep{ep}: reward={result['total_reward']:.2f}, blackout={result['is_blackout']}\")\n",
|
| 477 |
+
" trained_results[task_id] = {\n",
|
| 478 |
+
" \"avg_reward\": np.mean(rewards),\n",
|
| 479 |
+
" \"std_reward\": np.std(rewards),\n",
|
| 480 |
+
" \"rewards\": rewards\n",
|
| 481 |
+
" }\n",
|
| 482 |
+
" print(f\"[TRAINED] {task_id}: {np.mean(rewards):.2f} ± {np.std(rewards):.2f}\\n\")"
|
| 483 |
+
]
|
| 484 |
+
},
|
| 485 |
+
{
|
| 486 |
+
"cell_type": "markdown",
|
| 487 |
+
"metadata": {},
|
| 488 |
+
"source": [
|
| 489 |
+
"## 12. Generate Before/After Plots 📊"
|
| 490 |
+
]
|
| 491 |
+
},
|
| 492 |
+
{
|
| 493 |
+
"cell_type": "code",
|
| 494 |
+
"execution_count": null,
|
| 495 |
+
"metadata": {},
|
| 496 |
+
"outputs": [],
|
| 497 |
+
"source": [
|
| 498 |
+
"import matplotlib.pyplot as plt\n",
|
| 499 |
+
"import pickle\n",
|
| 500 |
+
"\n",
|
| 501 |
+
"# Load baseline\n",
|
| 502 |
+
"with open(\"training/outputs/baseline_results.pkl\", \"rb\") as f:\n",
|
| 503 |
+
" baseline_results = pickle.load(f)\n",
|
| 504 |
+
"\n",
|
| 505 |
+
"# ── Plot 1: Before vs After Bar Chart ──\n",
|
| 506 |
+
"common_tasks = [t for t in baseline_results if t in trained_results]\n",
|
| 507 |
+
"fig, ax = plt.subplots(figsize=(10, 6))\n",
|
| 508 |
+
"x = np.arange(len(common_tasks))\n",
|
| 509 |
+
"width = 0.35\n",
|
| 510 |
+
"\n",
|
| 511 |
+
"before_vals = [baseline_results[t]['avg_reward'] for t in common_tasks]\n",
|
| 512 |
+
"after_vals = [trained_results[t]['avg_reward'] for t in common_tasks]\n",
|
| 513 |
+
"\n",
|
| 514 |
+
"bars1 = ax.bar(x - width/2, before_vals, width, label='Heuristic Baseline', color='#ff6b6b', alpha=0.8)\n",
|
| 515 |
+
"bars2 = ax.bar(x + width/2, after_vals, width, label='GRPO Trained', color='#00d4aa', alpha=0.8)\n",
|
| 516 |
+
"\n",
|
| 517 |
+
"ax.set_xlabel('Task', fontsize=12)\n",
|
| 518 |
+
"ax.set_ylabel('Average Episode Reward', fontsize=12)\n",
|
| 519 |
+
"ax.set_title('OpenGrid — GRPO Training: Before vs After', fontsize=14, fontweight='bold')\n",
|
| 520 |
+
"ax.set_xticks(x)\n",
|
| 521 |
+
"ax.set_xticklabels([t.replace('task_', '').title() for t in common_tasks])\n",
|
| 522 |
+
"ax.legend(fontsize=11)\n",
|
| 523 |
+
"ax.grid(True, alpha=0.3, axis='y')\n",
|
| 524 |
+
"\n",
|
| 525 |
+
"# Fix label positioning for negative bar heights\n",
|
| 526 |
+
"for bars in (bars1, bars2):\n",
|
| 527 |
+
" for bar in bars:\n",
|
| 528 |
+
" h = bar.get_height()\n",
|
| 529 |
+
" ax.text(\n",
|
| 530 |
+
" bar.get_x() + bar.get_width() / 2.,\n",
|
| 531 |
+
" h + (2 if h >= 0 else -5),\n",
|
| 532 |
+
" f'{h:.1f}',\n",
|
| 533 |
+
" ha='center', va='bottom' if h >= 0 else 'top', fontsize=10\n",
|
| 534 |
+
" )\n",
|
| 535 |
+
"\n",
|
| 536 |
+
"plt.tight_layout()\n",
|
| 537 |
+
"plt.savefig('training/outputs/before_after.png', dpi=150)\n",
|
| 538 |
+
"plt.show()\n",
|
| 539 |
+
"print(\"✅ Saved: training/outputs/before_after.png\")"
|
| 540 |
+
]
|
| 541 |
+
},
|
| 542 |
+
{
|
| 543 |
+
"cell_type": "code",
|
| 544 |
+
"execution_count": null,
|
| 545 |
+
"metadata": {},
|
| 546 |
+
"outputs": [],
|
| 547 |
+
"source": [
|
| 548 |
+
"# ── Plot 2: Training Reward Curve ──\n",
|
| 549 |
+
"history = trainer.state.log_history\n",
|
| 550 |
+
"\n",
|
| 551 |
+
"steps = [h['step'] for h in history if 'loss' in h]\n",
|
| 552 |
+
"losses = [h['loss'] for h in history if 'loss' in h]\n",
|
| 553 |
+
"\n",
|
| 554 |
+
"fig, ax = plt.subplots(figsize=(10, 5))\n",
|
| 555 |
+
"ax.plot(steps, losses, color='#ff6b6b', linewidth=1.5, alpha=0.6, label='Loss')\n",
|
| 556 |
+
"if len(losses) > 10:\n",
|
| 557 |
+
" window = min(20, len(losses) // 3)\n",
|
| 558 |
+
" smoothed = np.convolve(losses, np.ones(window)/window, mode='valid')\n",
|
| 559 |
+
" ax.plot(steps[window-1:], smoothed, color='#ff6b6b', linewidth=2.5, label=f'Smoothed (w={window})')\n",
|
| 560 |
+
"\n",
|
| 561 |
+
"ax.set_xlabel('Training Step', fontsize=12)\n",
|
| 562 |
+
"ax.set_ylabel('Loss', fontsize=12)\n",
|
| 563 |
+
"ax.set_title('OpenGrid GRPO — Training Loss', fontsize=14, fontweight='bold')\n",
|
| 564 |
+
"ax.legend()\n",
|
| 565 |
+
"ax.grid(True, alpha=0.3)\n",
|
| 566 |
+
"plt.tight_layout()\n",
|
| 567 |
+
"plt.savefig('training/outputs/training_loss.png', dpi=150)\n",
|
| 568 |
+
"plt.show()\n",
|
| 569 |
+
"print(\"✅ Saved: training/outputs/training_loss.png\")"
|
| 570 |
+
]
|
| 571 |
+
},
|
| 572 |
+
{
|
| 573 |
+
"cell_type": "markdown",
|
| 574 |
+
"metadata": {},
|
| 575 |
+
"source": [
|
| 576 |
+
"## 13. Summary & Next Steps\n",
|
| 577 |
+
"\n",
|
| 578 |
+
"### Results Table"
|
| 579 |
+
]
|
| 580 |
+
},
|
| 581 |
+
{
|
| 582 |
+
"cell_type": "code",
|
| 583 |
+
"execution_count": null,
|
| 584 |
+
"metadata": {},
|
| 585 |
+
"outputs": [],
|
| 586 |
+
"source": [
|
| 587 |
+
"print(\"=\"*60)\n",
|
| 588 |
+
"print(\" OpenGrid GRPO Training — Results Summary\")\n",
|
| 589 |
+
"print(\"=\"*60)\n",
|
| 590 |
+
"\n",
|
| 591 |
+
"# Rebuild common_tasks in case Cell 12 was skipped\n",
|
| 592 |
+
"common_tasks = [t for t in baseline_results if t in trained_results]\n",
|
| 593 |
+
"\n",
|
| 594 |
+
"print(f\"{'Task':<20} {'Baseline':>12} {'Trained':>12} {'Δ':>10}\")\n",
|
| 595 |
+
"print(\"-\"*60)\n",
|
| 596 |
+
"for t in common_tasks:\n",
|
| 597 |
+
" b = baseline_results[t]['avg_reward']\n",
|
| 598 |
+
" a = trained_results[t]['avg_reward']\n",
|
| 599 |
+
" delta = a - b\n",
|
| 600 |
+
" arrow = '↑' if delta > 0 else '↓'\n",
|
| 601 |
+
" print(f\"{t:<20} {b:>10.2f} {a:>10.2f} {arrow} {abs(delta):.2f}\")\n",
|
| 602 |
+
"print(\"=\"*60)"
|
| 603 |
+
]
|
| 604 |
+
},
|
| 605 |
+
{
|
| 606 |
+
"cell_type": "code",
|
| 607 |
+
"execution_count": null,
|
| 608 |
+
"metadata": {},
|
| 609 |
+
"outputs": [],
|
| 610 |
+
"source": [
|
| 611 |
+
"# Download plots for your README\n",
|
| 612 |
+
"from google.colab import files\n",
|
| 613 |
+
"files.download('training/outputs/before_after.png')\n",
|
| 614 |
+
"files.download('training/outputs/training_loss.png')"
|
| 615 |
+
]
|
| 616 |
+
}
|
| 617 |
+
],
|
| 618 |
+
"metadata": {
|
| 619 |
+
"accelerator": "GPU",
|
| 620 |
+
"colab": {
|
| 621 |
+
"gpuType": "T4",
|
| 622 |
+
"provenance": []
|
| 623 |
+
},
|
| 624 |
+
"kernelspec": {
|
| 625 |
+
"display_name": "Python 3",
|
| 626 |
+
"name": "python3"
|
| 627 |
+
},
|
| 628 |
+
"language_info": {
|
| 629 |
+
"name": "python",
|
| 630 |
+
"version": "3.10.0"
|
| 631 |
+
}
|
| 632 |
+
},
|
| 633 |
+
"nbformat": 4,
|
| 634 |
+
"nbformat_minor": 0
|
| 635 |
+
}
|
training/train_grpo.py
ADDED
|
@@ -0,0 +1,827 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenGrid GRPO Training Script
|
| 3 |
+
==============================
|
| 4 |
+
Uses TRL's GRPOTrainer to train an LLM for multi-agent power grid control.
|
| 5 |
+
|
| 6 |
+
The LLM receives grid observations (partial, per-zone) as text prompts,
|
| 7 |
+
generates JSON actions, and is trained via GRPO to maximize grid stability rewards.
|
| 8 |
+
|
| 9 |
+
Compatible with:
|
| 10 |
+
- Unsloth for 4-bit quantized training (recommended)
|
| 11 |
+
- HuggingFace TRL GRPOTrainer
|
| 12 |
+
- Colab / HF Spaces with GPU
|
| 13 |
+
|
| 14 |
+
Usage:
|
| 15 |
+
# Quick test (no GPU needed, just verifies the pipeline)
|
| 16 |
+
python training/train_grpo.py --test-mode
|
| 17 |
+
|
| 18 |
+
# Full training on GPU
|
| 19 |
+
python training/train_grpo.py --model Qwen/Qwen2.5-1.5B-Instruct --epochs 3
|
| 20 |
+
|
| 21 |
+
# With Unsloth quantization (faster, less memory)
|
| 22 |
+
python training/train_grpo.py --model unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit --use-unsloth
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
import argparse
|
| 26 |
+
import copy
|
| 27 |
+
import json
|
| 28 |
+
import random
|
| 29 |
+
import sys
|
| 30 |
+
import os
|
| 31 |
+
import re
|
| 32 |
+
import time
|
| 33 |
+
from pathlib import Path
|
| 34 |
+
|
| 35 |
+
# Add project root to path
|
| 36 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 37 |
+
|
| 38 |
+
import numpy as np
|
| 39 |
+
import matplotlib
|
| 40 |
+
matplotlib.use('Agg')
|
| 41 |
+
import matplotlib.pyplot as plt
|
| 42 |
+
|
| 43 |
+
from src.environment import OpenGridEnv
|
| 44 |
+
from src.tasks import TASKS
|
| 45 |
+
from src.models import GridAction, BusAdjustment, TopologyAction
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# ============================================================================
|
| 49 |
+
# Prompt Engineering
|
| 50 |
+
# ============================================================================
|
| 51 |
+
|
| 52 |
+
SYSTEM_PROMPT = """You are an AI power grid operator for the Karnataka Power Transmission Corporation (KPTCL).
|
| 53 |
+
You manage one zone of a multi-agent grid. Your goal: keep frequency at 50.0 Hz, avoid line overloads, and prevent blackouts.
|
| 54 |
+
|
| 55 |
+
You receive partial observations of your zone and must output a JSON action.
|
| 56 |
+
Respond ONLY with valid JSON matching this schema:
|
| 57 |
+
{"bus_adjustments": [{"bus_id": <int>, "delta": <float>}], "topology_actions": []}
|
| 58 |
+
|
| 59 |
+
Rules:
|
| 60 |
+
- Positive delta = inject more power (discharge battery / increase generation)
|
| 61 |
+
- Negative delta = reduce injection (charge battery / decrease generation)
|
| 62 |
+
- Only adjust buses in YOUR zone
|
| 63 |
+
- Keep frequency close to 50.0 Hz
|
| 64 |
+
- Avoid overloading lines (rho > 1.0 is dangerous)"""
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def format_observation_prompt(obs_dict: dict, zone_name: str = "") -> str:
    """Render a zone observation dict as a human-readable LLM prompt.

    The prompt leads with the frequency status line, then lists local
    buses, stressed lines (loading above 80%), neighbor-zone injection
    signals, and the zone power balance, and ends with an instruction
    to respond in JSON.
    """
    frequency = obs_dict.get('grid_frequency', 50.0)
    step = obs_dict.get('timestep', 0)
    deviation = frequency - 50.0

    pieces = [f"[Zone: {zone_name}] Step {step} | Frequency: {frequency:.3f} Hz"]

    # Escalating frequency flags: >0.3 Hz is critical, >0.1 Hz a warning.
    if abs(deviation) > 0.3:
        pieces.append(f" [!] CRITICAL: {deviation:+.3f} Hz deviation!")
    elif abs(deviation) > 0.1:
        pieces.append(f" WARNING: {deviation:+.3f} Hz deviation")

    # Per-bus injections; batteries additionally report state of charge.
    local_buses = obs_dict.get('local_buses', [])
    if local_buses:
        pieces.append("\n\nYour buses:")
        for bus in local_buses:
            entry = f" Bus {bus['id']} ({bus['type']}): {bus['p_injection']:.1f} MW"
            if bus['type'] == 'battery':
                entry += f" | SoC: {bus['soc']:.1f} MWh"
            pieces.append(f"\n{entry}")

    # Flag connected lines whose loading ratio exceeds 0.8.
    candidate_lines = obs_dict.get('internal_lines', []) + obs_dict.get('boundary_lines', [])
    stressed = [ln for ln in candidate_lines
                if ln.get('rho', 0) > 0.8 and ln.get('connected', True)]
    if stressed:
        pieces.append("\n\n[!] Stressed lines:")
        for ln in stressed:
            pieces.append(f"\n {ln['id']}: {ln['rho']:.2f} loading ({ln['flow']:.1f} MW)")

    # Average injection reported by each neighboring zone.
    neighbor_signals = obs_dict.get('neighbor_signals', {})
    if neighbor_signals:
        pieces.append("\n\nNeighbor zones (avg injection):")
        for zone_id, avg_mw in neighbor_signals.items():
            pieces.append(f"\n Zone {zone_id}: {avg_mw:.1f} MW")

    # Net zone balance: generation minus load.
    load_mw = obs_dict.get('zone_load_mw', 0)
    gen_mw = obs_dict.get('zone_gen_mw', 0)
    if load_mw or gen_mw:
        pieces.append(
            f"\n\nZone balance: Gen={gen_mw:.1f} MW, Load={load_mw:.1f} MW, Net={gen_mw-load_mw:.1f} MW"
        )

    pieces.append("\n\nWhat action do you take? Respond with JSON only.")
    return "".join(pieces)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def extract_action(text: str) -> GridAction:
    """Parse LLM output to a GridAction, with fallback for malformed JSON.

    Finds the outermost ``{...}`` span in the response, parses it, and
    builds the action from its ``bus_adjustments`` / ``topology_actions``
    lists. Any parse or validation failure yields a safe no-op action.
    """
    text = text.strip()

    # Greedily grab the outermost {...} span in the response.
    json_match = re.search(r'\{[\s\S]*\}', text)
    if json_match:
        try:
            data = json.loads(json_match.group())
            return GridAction(
                bus_adjustments=[
                    BusAdjustment(**a) for a in data.get('bus_adjustments', [])
                ],
                topology_actions=[
                    TopologyAction(**t) for t in data.get('topology_actions', [])
                ],
            )
        except Exception:
            # Fix: the original `except (json.JSONDecodeError, Exception)` was
            # redundant — Exception already subsumes JSONDecodeError. The broad
            # catch itself is intentional: model-validation errors (bad keys,
            # wrong field types) must also fall through to the no-op action.
            pass

    # Fallback: no-op action
    return GridAction()
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
# ============================================================================
|
| 140 |
+
# Environment Rollout
|
| 141 |
+
# ============================================================================
|
| 142 |
+
|
| 143 |
+
def rollout_single_agent(env: OpenGridEnv, generate_fn, task_config: dict) -> dict:
    """Play one episode in single-agent mode and summarize it.

    Each step renders the observation to text, asks ``generate_fn`` for a
    completion, parses it into an action, and applies it. Returns the
    episode total, per-step rewards, step count, blackout flag, and the
    mean per-step reward.
    """
    observation = env.reset()
    step_rewards = []
    blackout = False

    for _ in range(task_config['max_steps']):
        prompt = format_observation_prompt(
            observation.model_dump(), zone_name="Full_Grid"
        )
        completion = generate_fn(prompt)
        observation, reward, done, info = env.step(extract_action(completion))
        step_rewards.append(reward.value)

        if done:
            blackout = info.is_blackout
            break

    episode_total = sum(step_rewards)
    n_steps = len(step_rewards)
    return {
        "total_reward": episode_total,
        "rewards": step_rewards,
        "steps": n_steps,
        "is_blackout": blackout,
        "avg_reward": episode_total / max(n_steps, 1),
    }
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def rollout_multi_agent(env: OpenGridEnv, generate_fn, task_config: dict) -> dict:
    """Play one episode in multi-agent mode and summarize it.

    Every agent receives its own zone observation, produces a completion
    via ``generate_fn``, and all parsed actions are applied jointly with
    ``env.step_multi``. Tracks team reward, per-agent rewards, and how
    many times the safety layer corrected an agent's action.
    """
    zone_obs = env.reset_multi()
    team_rewards = []
    agent_rewards = {aid: [] for aid in range(env.num_agents)}
    corrections = 0
    blackout = False

    for _ in range(task_config['max_steps']):
        actions = {
            agent_id: extract_action(
                generate_fn(
                    format_observation_prompt(obs.model_dump(), zone_name=obs.zone_name)
                )
            )
            for agent_id, obs in zone_obs.items()
        }

        result = env.step_multi(actions)

        team_rewards.append(result.team_reward)
        for agent_id, reward in result.rewards.items():
            agent_rewards[agent_id].append(reward.value)

        # safety_reports maps agent id -> SafetyReport; count corrected actions.
        corrections += sum(
            1 for report in result.safety_reports.values() if report.was_corrected
        )

        if result.done:
            blackout = result.info.is_blackout
            break

        zone_obs = result.observations

    episode_total = sum(team_rewards)
    n_steps = len(team_rewards)
    return {
        "total_reward": episode_total,
        "rewards": team_rewards,
        "per_agent_rewards": agent_rewards,
        "steps": n_steps,
        "is_blackout": blackout,
        "safety_interventions": corrections,
        "avg_reward": episode_total / max(n_steps, 1),
    }
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
# ============================================================================
|
| 227 |
+
# GRPO Reward Functions
|
| 228 |
+
# ============================================================================
|
| 229 |
+
|
| 230 |
+
# NOTE(review): this cache is declared but never read or written —
# _get_reward_env below builds a fresh environment on every call. Either
# wire the cache up or remove it; confirm no external code relies on the
# module attribute first.
_REWARD_ENV_CACHE = {}


def _get_reward_env(task_config: dict) -> OpenGridEnv:
    """Build, reset, and return a fresh environment for reward scoring.

    The task config is deep-copied so reward-side stepping never mutates
    the shared configuration dict used elsewhere in training.
    """
    env = OpenGridEnv(copy.deepcopy(task_config))
    env.reset()
    return env
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def compute_grpo_reward_env(
    completions: list,
    observations: list,
    task_config: dict,
    horizon: int = 3,
) -> list:
    """Environment-grounded reward: step the actual physics simulation.

    For each LLM-generated action:
    1. Restore the env to the observation state
    2. Step with the proposed action and get the real reward
    3. Run a short rollout (horizon steps) with heuristic continuation
       to capture trajectory-level impact
    4. Add format/schema bonuses

    This directly addresses the proxy-reward disconnect that caused
    the original GRPO training to show zero improvement.

    Args:
        completions: Raw LLM output strings to score.
        observations: Observation dicts (or their JSON-string form, as
            TRL may pass them) that each completion was generated from;
            ``None`` entries score 0.0.
        task_config: Task configuration used to build the scoring envs.
        horizon: Total lookahead depth — the LLM action plus
            ``horizon - 1`` heuristic continuation steps.

    Returns:
        One float per completion, clamped to [-1.0, 1.0].
    """
    # Imported lazily so module import does not pull in the baseline policy.
    from src.baseline import heuristic_policy

    rewards = []
    for completion, obs_dict in zip(completions, observations):
        if obs_dict is None:
            rewards.append(0.0)
            continue

        # Deserialize if needed (TRL may pass strings)
        if isinstance(obs_dict, str):
            try:
                obs_dict = json.loads(obs_dict)
            except (json.JSONDecodeError, TypeError):
                rewards.append(0.0)
                continue

        action = extract_action(completion)
        has_adjustments = bool(action.bus_adjustments)

        # ── 1. Format reward (small but keeps gradient alive) ──
        format_score = 0.0
        if has_adjustments:
            format_score += 0.05
        else:
            freq = obs_dict.get('grid_frequency', 50.0)
            if abs(freq - 50.0) < 0.05:
                format_score += 0.05  # No-op when stable is fine
            else:
                format_score -= 0.05  # No-op during deviation is bad

        # ── 2. Environment-grounded reward ──
        try:
            env = _get_reward_env(task_config)
            env._set_state(obs_dict)

            # Step with the LLM's proposed action
            obs_after, reward, done, info = env.step(action)
            env_score = reward.value

            # Blackout = catastrophic
            if info.is_blackout:
                rewards.append(-1.0)
                continue

            # ── 3. Mini-rollout: what happens next? ──
            # Run a few more steps with heuristic to measure trajectory impact
            rollout_reward = 0.0
            for _ in range(horizon - 1):
                if done:
                    break
                h_action = heuristic_policy(obs_after)
                obs_after, r, done, info = env.step(h_action)
                rollout_reward += r.value
                if info.is_blackout:
                    rollout_reward -= 10.0
                    break

            # Combine: immediate reward + discounted future
            total_env_score = env_score + 0.5 * rollout_reward

            # Normalize to [-1, 1] range
            # Typical per-step reward is ~0.5 to 1.5, rollout adds ~1-4
            # So total_env_score is roughly in [-10, 4] range
            normalized = total_env_score / 5.0

        except Exception:
            # Fix: dropped the unused `as e` binding from the original.
            # Fallback: use lightweight heuristic scoring when env stepping
            # fails for any reason (bad state dict, physics error, etc.).
            normalized = _compute_heuristic_score(action, obs_dict)

        total = format_score + normalized
        rewards.append(max(-1.0, min(1.0, total)))

    return rewards
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
def _compute_heuristic_score(action: GridAction, obs_dict: dict) -> float:
    """Lightweight fallback scorer when env rollout fails.

    Scores an action on three heuristics — push frequency toward 50 Hz
    in the right direction, use a roughly proportional magnitude
    (~15 MW per Hz of error), and avoid large moves when the grid is
    already stable — with the result clamped to [-0.5, 0.5].
    """
    frequency = obs_dict.get('grid_frequency', 50.0)
    error = frequency - 50.0
    error_magnitude = abs(error)

    # No adjustments at all: nothing to score.
    if not action.bus_adjustments:
        return 0.0

    net_delta = sum(adj.delta for adj in action.bus_adjustments)
    score = 0.0

    if error_magnitude > 0.05:
        # Direction: under-frequency wants injection, over-frequency reduction.
        pushes_right_way = (error < 0 < net_delta) or (error > 0 > net_delta)
        score += 0.3 if pushes_right_way else -0.3

        # Proportionality: reward magnitudes near the ~15 MW/Hz ideal.
        target = error_magnitude * 15.0
        applied = abs(net_delta)
        if applied > 0.1:
            closeness = min(applied, target) / max(applied, target, 0.1)
            score += 0.2 * closeness

    # Stability bonus: small moves while frequency is near-nominal.
    if error_magnitude < 0.05 and abs(net_delta) < 2.0:
        score += 0.1

    return max(-0.5, min(0.5, score))
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
# Keep old function for backward compat / test mode
|
| 368 |
+
def compute_grpo_reward(completions: list, observations: list, env_url: str = None) -> list:
    """Legacy heuristic reward (used in test mode only)."""
    scores = []
    for completion, obs in zip(completions, observations):
        parsed = extract_action(completion)
        scores.append(_compute_heuristic_score(parsed, obs or {}))
    return scores
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
# ============================================================================
|
| 375 |
+
# Training Loop
|
| 376 |
+
# ============================================================================
|
| 377 |
+
|
| 378 |
+
def train_grpo(args):
|
| 379 |
+
"""Main GRPO training loop using TRL."""
|
| 380 |
+
try:
|
| 381 |
+
from trl import GRPOTrainer, GRPOConfig
|
| 382 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 383 |
+
except ImportError:
|
| 384 |
+
print("ERROR: TRL not installed. Run: pip install trl transformers")
|
| 385 |
+
print("For quantized training: pip install unsloth")
|
| 386 |
+
sys.exit(1)
|
| 387 |
+
|
| 388 |
+
print(f"[TRAIN] Model: {args.model}")
|
| 389 |
+
print(f"[TRAIN] Task: {args.task}")
|
| 390 |
+
print(f"[TRAIN] Epochs: {args.epochs}")
|
| 391 |
+
print(f"[TRAIN] Batch size: {args.batch_size}")
|
| 392 |
+
|
| 393 |
+
# Load model
|
| 394 |
+
if args.use_unsloth:
|
| 395 |
+
try:
|
| 396 |
+
from unsloth import FastLanguageModel
|
| 397 |
+
model, tokenizer = FastLanguageModel.from_pretrained(
|
| 398 |
+
model_name=args.model,
|
| 399 |
+
max_seq_length=2048,
|
| 400 |
+
load_in_4bit=True,
|
| 401 |
+
)
|
| 402 |
+
model = FastLanguageModel.get_peft_model(
|
| 403 |
+
model,
|
| 404 |
+
r=16, lora_alpha=16, lora_dropout=0,
|
| 405 |
+
target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
|
| 406 |
+
"gate_proj", "up_proj", "down_proj"],
|
| 407 |
+
)
|
| 408 |
+
print("[TRAIN] Loaded with Unsloth 4-bit quantization")
|
| 409 |
+
except ImportError:
|
| 410 |
+
print("WARNING: Unsloth not available, falling back to standard loading")
|
| 411 |
+
tokenizer = AutoTokenizer.from_pretrained(args.model)
|
| 412 |
+
model = AutoModelForCausalLM.from_pretrained(args.model)
|
| 413 |
+
else:
|
| 414 |
+
tokenizer = AutoTokenizer.from_pretrained(args.model)
|
| 415 |
+
model = AutoModelForCausalLM.from_pretrained(args.model)
|
| 416 |
+
|
| 417 |
+
if tokenizer.pad_token is None:
|
| 418 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 419 |
+
|
| 420 |
+
# Prepare training data: observation prompts from the environment
|
| 421 |
+
task_config = copy.deepcopy(TASKS[args.task])
|
| 422 |
+
base_seed = task_config.get('seed', 42)
|
| 423 |
+
|
| 424 |
+
# Generate prompts with diverse grid states:
|
| 425 |
+
# - Larger random perturbations (-30 to +30 MW)
|
| 426 |
+
# - Adversarial states (drained batteries, high frequency deviation)
|
| 427 |
+
# - More steps per episode for temporal diversity
|
| 428 |
+
print("[TRAIN] Generating training prompts from environment...")
|
| 429 |
+
prompts = []
|
| 430 |
+
obs_contexts = []
|
| 431 |
+
rng = np.random.RandomState(base_seed)
|
| 432 |
+
|
| 433 |
+
steps_per_episode = min(15, task_config['max_steps'])
|
| 434 |
+
|
| 435 |
+
for episode in range(args.num_prompts):
|
| 436 |
+
ep_config = copy.deepcopy(task_config)
|
| 437 |
+
ep_config['seed'] = base_seed + episode
|
| 438 |
+
env = OpenGridEnv(ep_config)
|
| 439 |
+
zone_obs = env.reset_multi()
|
| 440 |
+
|
| 441 |
+
# Adversarial injection: every 5th episode, drain batteries
|
| 442 |
+
if episode % 5 == 0:
|
| 443 |
+
for b in env.bus_state:
|
| 444 |
+
b_cfg = env._find_bus_config(b['id'])
|
| 445 |
+
if b_cfg and b_cfg['type'] == 'battery':
|
| 446 |
+
b['soc'] = max(1.0, b['soc'] * 0.1) # Near-empty
|
| 447 |
+
|
| 448 |
+
for t in range(steps_per_episode):
|
| 449 |
+
for agent_id, obs in zone_obs.items():
|
| 450 |
+
obs_dict = json.loads(obs.model_dump_json())
|
| 451 |
+
prompt_text = format_observation_prompt(obs_dict, zone_name=obs.zone_name)
|
| 452 |
+
|
| 453 |
+
messages = [
|
| 454 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 455 |
+
{"role": "user", "content": prompt_text},
|
| 456 |
+
]
|
| 457 |
+
|
| 458 |
+
formatted = tokenizer.apply_chat_template(
|
| 459 |
+
messages, tokenize=False, add_generation_prompt=True
|
| 460 |
+
)
|
| 461 |
+
prompts.append(formatted)
|
| 462 |
+
obs_contexts.append(json.dumps(obs_dict)) # Store as string for Arrow compat
|
| 463 |
+
|
| 464 |
+
# Larger random perturbations for state diversity
|
| 465 |
+
random_actions = {}
|
| 466 |
+
for agent_id in range(env.num_agents):
|
| 467 |
+
zone_buses = task_config['zone_bus_ids'].get(agent_id, [])
|
| 468 |
+
controllable = [
|
| 469 |
+
bid for bid in zone_buses
|
| 470 |
+
if next((b for b in task_config['buses'] if b['id'] == bid), {}).get('type')
|
| 471 |
+
in ['generator', 'battery']
|
| 472 |
+
]
|
| 473 |
+
adj = []
|
| 474 |
+
if controllable:
|
| 475 |
+
# Pick 1-2 buses with larger perturbations
|
| 476 |
+
n_adj = min(len(controllable), rng.randint(1, 3))
|
| 477 |
+
chosen = rng.choice(controllable, size=n_adj, replace=False)
|
| 478 |
+
for bid in chosen:
|
| 479 |
+
adj.append(BusAdjustment(
|
| 480 |
+
bus_id=int(bid),
|
| 481 |
+
delta=float(rng.uniform(-30, 30)) # Was ±15
|
| 482 |
+
))
|
| 483 |
+
random_actions[agent_id] = GridAction(bus_adjustments=adj)
|
| 484 |
+
|
| 485 |
+
result = env.step_multi(random_actions)
|
| 486 |
+
if result.done:
|
| 487 |
+
break
|
| 488 |
+
zone_obs = result.observations
|
| 489 |
+
|
| 490 |
+
print(f"[TRAIN] Generated {len(prompts)} training prompts")
|
| 491 |
+
|
| 492 |
+
# GRPO reward function: environment-grounded
|
| 493 |
+
def reward_fn(completions, obs_context=None, **kwargs):
    """Environment-grounded GRPO reward.

    Steps the actual physics simulation to score each action,
    rather than using a disconnected heuristic proxy.
    """

    def _as_text(completion):
        # Chat-style completions arrive as a message list; keep the last turn.
        if isinstance(completion, list):
            return completion[-1]['content'] if completion else ""
        return str(completion)

    def _as_obs(ctx):
        # Contexts were serialized to JSON strings for Arrow compatibility;
        # tolerate malformed payloads by falling back to None.
        if not isinstance(ctx, str):
            return ctx
        try:
            return json.loads(ctx)
        except (json.JSONDecodeError, TypeError):
            return None

    texts = [_as_text(c) for c in completions]
    contexts = obs_context if obs_context is not None else [None] * len(texts)
    obs_dicts = [_as_obs(ctx) for ctx in contexts]

    return compute_grpo_reward_env(texts, obs_dicts, task_config, horizon=3)
|
| 522 |
+
|
| 523 |
+
# GRPO Config — tuned for sustained learning signal
|
| 524 |
+
grpo_config = GRPOConfig(
|
| 525 |
+
output_dir=str(Path(args.output_dir) / "grpo_checkpoints"),
|
| 526 |
+
num_train_epochs=args.epochs,
|
| 527 |
+
per_device_train_batch_size=args.batch_size,
|
| 528 |
+
gradient_accumulation_steps=max(1, 16 // args.batch_size),
|
| 529 |
+
learning_rate=1e-5, # Was 5e-6 — slightly more aggressive
|
| 530 |
+
logging_steps=5,
|
| 531 |
+
save_steps=50,
|
| 532 |
+
max_completion_length=256,
|
| 533 |
+
num_generations=8, # Was 4 — wider group for better ranking signal
|
| 534 |
+
report_to="none",
|
| 535 |
+
remove_unused_columns=False,
|
| 536 |
+
)
|
| 537 |
+
|
| 538 |
+
# Create dataset — include obs_context so TRL passes it to reward_fn
|
| 539 |
+
from datasets import Dataset
|
| 540 |
+
train_dataset = Dataset.from_dict({
|
| 541 |
+
"prompt": prompts,
|
| 542 |
+
"obs_context": obs_contexts,
|
| 543 |
+
})
|
| 544 |
+
|
| 545 |
+
# Initialize trainer
|
| 546 |
+
trainer = GRPOTrainer(
|
| 547 |
+
model=model,
|
| 548 |
+
args=grpo_config,
|
| 549 |
+
train_dataset=train_dataset,
|
| 550 |
+
reward_funcs=reward_fn,
|
| 551 |
+
processing_class=tokenizer,
|
| 552 |
+
)
|
| 553 |
+
|
| 554 |
+
# Train
|
| 555 |
+
print("[TRAIN] Starting GRPO training...")
|
| 556 |
+
train_result = trainer.train()
|
| 557 |
+
|
| 558 |
+
# Save model
|
| 559 |
+
output_path = Path(args.output_dir) / "trained_model"
|
| 560 |
+
trainer.save_model(str(output_path))
|
| 561 |
+
tokenizer.save_pretrained(str(output_path))
|
| 562 |
+
print(f"[TRAIN] Model saved to {output_path}")
|
| 563 |
+
|
| 564 |
+
return train_result
|
| 565 |
+
|
| 566 |
+
|
| 567 |
+
# ============================================================================
|
| 568 |
+
# Evaluation & Plotting
|
| 569 |
+
# ============================================================================
|
| 570 |
+
|
| 571 |
+
def evaluate_model(generate_fn, task_ids=None, n_episodes=3, multi_agent=True):
    """Evaluate a model across tasks. Returns per-task results.

    Each episode uses a distinct seed to produce meaningful variance.
    """
    selected = list(TASKS.keys()) if task_ids is None else task_ids
    rollout = rollout_multi_agent if multi_agent else rollout_single_agent

    results = {}
    for task_id in selected:
        base_config = TASKS[task_id]
        base_seed = base_config.get('seed', 42)

        rewards = []
        for ep in range(n_episodes):
            # Fresh deep copy + offset seed -> independent rollout per episode.
            cfg = copy.deepcopy(base_config)
            cfg['seed'] = base_seed + ep
            episode = rollout(OpenGridEnv(cfg), generate_fn, cfg)
            rewards.append(episode['total_reward'])

        results[task_id] = {
            "avg_reward": np.mean(rewards),
            "std_reward": np.std(rewards),
            "rewards": rewards,
        }

    return results
|
| 604 |
+
|
| 605 |
+
|
| 606 |
+
def plot_training_curves(training_log: list, output_path: str):
    """Generate reward curves from training log."""
    if not training_log:
        print("[PLOT] No training data to plot.")
        return

    fig, (ax_reward, ax_loss) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: raw per-step reward.
    rewards = [entry.get('reward', 0) for entry in training_log]
    ax_reward.plot(range(len(rewards)), rewards, color='#00d4aa',
                   linewidth=1.5, alpha=0.6, label='Step Reward')

    # Overlay a moving-average curve once there is enough data to smooth.
    if len(rewards) > 10:
        window = min(20, len(rewards) // 5)
        smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
        ax_reward.plot(range(window - 1, len(rewards)), smoothed, color='#00d4aa',
                       linewidth=2.5, label=f'Smoothed (window={window})')

    ax_reward.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
    ax_reward.set_xlabel('Training Step')
    ax_reward.set_ylabel('Reward')
    ax_reward.set_title('GRPO Training — Reward Curve')
    ax_reward.legend()
    ax_reward.grid(True, alpha=0.3)

    # Right panel: loss, when the log entries carry one.
    losses = [entry['loss'] for entry in training_log if 'loss' in entry]
    if losses:
        ax_loss.plot(range(len(losses)), losses, color='#ff6b6b', linewidth=1.5)
        ax_loss.set_xlabel('Training Step')
        ax_loss.set_ylabel('Loss')
        ax_loss.grid(True, alpha=0.3)
    else:
        ax_loss.text(0.5, 0.5, 'Loss data not available', ha='center', va='center',
                     transform=ax_loss.transAxes, fontsize=14, color='gray')
    ax_loss.set_title('Training Loss')

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight')
    plt.close()
    print(f"[PLOT] Saved training curves to {output_path}")
|
| 651 |
+
|
| 652 |
+
|
| 653 |
+
def plot_before_after(before_results: dict, after_results: dict, output_path: str):
    """Generate before/after comparison chart."""
    fig, ax = plt.subplots(figsize=(10, 6))

    tasks = list(before_results.keys())
    positions = np.arange(len(tasks))
    width = 0.35

    before_vals = [before_results[t]['avg_reward'] for t in tasks]
    after_vals = [after_results[t]['avg_reward'] for t in tasks]

    before_bars = ax.bar(positions - width / 2, before_vals, width,
                         label='Before Training', color='#ff6b6b', alpha=0.8)
    after_bars = ax.bar(positions + width / 2, after_vals, width,
                        label='After Training', color='#00d4aa', alpha=0.8)

    ax.set_xlabel('Task')
    ax.set_ylabel('Average Episode Reward')
    ax.set_title('OpenGrid — GRPO Training: Before vs After')
    ax.set_xticks(positions)
    ax.set_xticklabels([t.replace('task_', '').title() for t in tasks])
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    # Annotate each bar with its value; flip anchor and offset for negative bars.
    for bar in (*before_bars, *after_bars):
        height = bar.get_height()
        above = height >= 0
        ax.text(bar.get_x() + bar.get_width() / 2., height + (1 if above else -1),
                f'{height:.1f}', ha='center',
                va='bottom' if above else 'top', fontsize=9)

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight')
    plt.close()
    print(f"[PLOT] Saved before/after comparison to {output_path}")
|
| 689 |
+
|
| 690 |
+
|
| 691 |
+
# ============================================================================
|
| 692 |
+
# Test Mode
|
| 693 |
+
# ============================================================================
|
| 694 |
+
|
| 695 |
+
def run_test_mode():
    """Quick pipeline verification without GPU. Runs a few episodes with heuristic."""
    banner = "=" * 60
    print("\n" + banner)
    print(" OpenGrid GRPO Training — TEST MODE")
    print(" (Verifies the pipeline without training)")
    print(banner + "\n")

    # Test 1: Prompt generation
    print("[TEST] Generating prompts...")
    env = OpenGridEnv(TASKS["task_easy"])
    for agent_id, obs in env.reset_multi().items():
        prompt = format_observation_prompt(obs.model_dump(), zone_name=obs.zone_name)
        print(f"\n--- Agent {agent_id} ({obs.zone_name}) ---")
        print(prompt[:500])

    # Test 2: Action extraction
    print("\n[TEST] Testing action extraction...")
    extraction_inputs = [
        '{"bus_adjustments": [{"bus_id": 1, "delta": 5.0}], "topology_actions": []}',
        'Here is my action: {"bus_adjustments": [], "topology_actions": []}',
        'invalid garbage',
    ]
    for raw in extraction_inputs:
        parsed = extract_action(raw)
        print(f" Input: {raw[:60]}... -> {len(parsed.bus_adjustments)} adjustments")

    # Test 3: Multi-agent rollout with heuristic
    print("\n[TEST] Running multi-agent rollout...")
    from src.baseline import heuristic_policy

    def heuristic_generate(prompt):
        """Pseudo-LLM: use heuristic policy and format as JSON."""
        # Extract frequency from prompt (handles negative/signed values)
        match = re.search(r'Frequency:\s*([-+]?\d+(?:\.\d+)?)', prompt)
        freq = 50.0 if match is None else float(match.group(1))

        # Simple proportional control, clamped
        delta = min(20, max(-20, (50.0 - freq) * 10))

        # Find controllable buses (generator/battery, NOT slack — physics overwrites it)
        controllable = re.findall(r'Bus (\d+) \((generator|battery)\)', prompt)
        if not controllable:
            return json.dumps({"bus_adjustments": [], "topology_actions": []})

        # Distribute the correction evenly across controllable buses.
        share = delta / len(controllable)
        return json.dumps({
            "bus_adjustments": [
                {"bus_id": int(m[0]), "delta": round(share, 1)}
                for m in controllable
            ],
            "topology_actions": [],
        })

    for task_id in ["task_easy", "task_medium"]:
        config = copy.deepcopy(TASKS[task_id])
        env = OpenGridEnv(config)
        result = rollout_multi_agent(env, heuristic_generate, config)
        print(f" {task_id}: reward={result['total_reward']:.2f}, "
              f"steps={result['steps']}, blackout={result['is_blackout']}, "
              f"safety_interventions={result['safety_interventions']}")

    # Test 4: Reward function
    print("\n[TEST] Testing GRPO reward function...")
    test_completions = [
        '{"bus_adjustments": [{"bus_id": 1, "delta": 5.0}], "topology_actions": []}',
        '{"bus_adjustments": [], "topology_actions": []}',
        'not valid json at all',
    ]
    test_obs = [{"grid_frequency": 49.5}, {"grid_frequency": 50.0}, {"grid_frequency": 50.3}]
    for completion, reward in zip(test_completions,
                                  compute_grpo_reward(test_completions, test_obs)):
        print(f" Reward: {reward:.2f} for: {completion[:50]}...")

    # Test 5: Generate plots
    output_dir = Path("training/outputs")
    output_dir.mkdir(parents=True, exist_ok=True)

    fake_log = [{"reward": np.random.normal(0.5, 0.3) + i * 0.01, "loss": 2.0 - i * 0.02}
                for i in range(100)]
    plot_training_curves(fake_log, str(output_dir / "test_training_curves.png"))

    fake_before = {t: {"avg_reward": np.random.uniform(20, 35)} for t in TASKS}
    fake_after = {t: {"avg_reward": np.random.uniform(40, 55)} for t in TASKS}
    plot_before_after(fake_before, fake_after, str(output_dir / "test_before_after.png"))

    print("\n" + banner)
    print(" [OK] ALL TESTS PASSED - Pipeline is ready for GPU training")
    print(banner)
|
| 788 |
+
|
| 789 |
+
# ============================================================================
|
| 790 |
+
# Main
|
| 791 |
+
# ============================================================================
|
| 792 |
+
|
| 793 |
+
def main():
    """CLI entry point: parse arguments, then run test mode or GRPO training.

    Flags mirror the training pipeline's knobs (model, task, epochs, batch
    size, prompt-generation episode count, output directory, Unsloth 4-bit
    mode) plus a GPU-free --test-mode verification path.
    """
    parser = argparse.ArgumentParser(description="OpenGrid GRPO Training")
    parser.add_argument("--model", default="Qwen/Qwen2.5-1.5B-Instruct",
                        help="HuggingFace model name or path")
    parser.add_argument("--task", default="task_easy", choices=list(TASKS.keys()),
                        help="Which task to train on")
    parser.add_argument("--epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--batch-size", type=int, default=2, help="Batch size per device")
    parser.add_argument("--num-prompts", type=int, default=50,
                        help="Number of episodes to generate prompts from")
    parser.add_argument("--output-dir", default="training/outputs",
                        help="Directory for checkpoints and plots")
    parser.add_argument("--use-unsloth", action="store_true",
                        help="Use Unsloth for 4-bit quantized training")
    parser.add_argument("--test-mode", action="store_true",
                        help="Run pipeline verification without GPU")

    args = parser.parse_args()

    # Test mode verifies the pipeline (prompts, parsing, rollouts, plots)
    # without touching a GPU, then exits.
    if args.test_mode:
        run_test_mode()
        return

    # Create output directory before training writes checkpoints/plots into it.
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    # train_grpo saves the model itself; its return value is not needed here
    # (previously bound to an unused local).
    train_grpo(args)

    print("\n[DONE] Training complete!")
    print(f" Output: {args.output_dir}")


if __name__ == "__main__":
    main()
|
validate-submission.sh
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# validate-submission.sh — three-step sanity check for an OpenEnv submission:
#   1. the deployed HF Space answers POST /reset
#   2. the repo's Docker image builds
#   3. `openenv validate` passes inside the repo
# Usage: ./validate-submission.sh <ping_url> [repo_dir]
set -uo pipefail

DOCKER_BUILD_TIMEOUT=600
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BOLD='\033[1m'
NC='\033[0m'

PING_URL="${1:-}"
REPO_DIR="${2:-.}"

if [ -z "$PING_URL" ]; then
  printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
  exit 1
fi

# Resolve to an absolute path and fail fast if the directory doesn't exist.
# (Previously a failed `cd` silently left REPO_DIR empty, so later steps
# probed paths like "/Dockerfile" and produced confusing errors.)
REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)" || {
  printf "Error: repo_dir '%s' is not a directory\n" "${2:-.}"
  exit 1
}
PING_URL="${PING_URL%/}"  # strip trailing slash so "$PING_URL/reset" stays well-formed
PASS=0

log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
fail() { log "${RED}FAILED${NC} -- $1"; }
hint() { printf " ${YELLOW}Hint:${NC} %b\n" "$1"; }
stop_at() {
  printf "\n${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
  exit 1
}

printf "\n${BOLD}========================================${NC}\n"
printf "${BOLD} OpenEnv Submission Validator${NC}\n"
printf "${BOLD}========================================${NC}\n"
log "Repo: $REPO_DIR"
log "Ping URL: $PING_URL"
printf "\n"

log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
# "000" sentinel covers curl failures (timeout, DNS, refused connection).
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST \
  -H "Content-Type: application/json" -d '{}' \
  "$PING_URL/reset" --max-time 30 2>/dev/null || printf "000")

if [ "$HTTP_CODE" = "200" ]; then
  pass "HF Space is live and responds to /reset"
elif [ "$HTTP_CODE" = "000" ]; then
  fail "HF Space not reachable (connection failed or timed out)"
  stop_at "Step 1"
else
  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
  stop_at "Step 1"
fi

log "${BOLD}Step 2/3: Running docker build${NC} ..."
if ! command -v docker &>/dev/null; then
  fail "docker command not found"
  stop_at "Step 2"
fi

# Accept a Dockerfile either at the repo root or under server/.
if [ -f "$REPO_DIR/Dockerfile" ]; then
  DOCKER_CONTEXT="$REPO_DIR"
elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
  DOCKER_CONTEXT="$REPO_DIR/server"
else
  fail "No Dockerfile found"
  stop_at "Step 2"
fi

log " Found Dockerfile in $DOCKER_CONTEXT"
BUILD_OK=false
BUILD_OUTPUT=$(timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true

if [ "$BUILD_OK" = true ]; then
  pass "Docker build succeeded"
else
  fail "Docker build failed"
  # Show only the tail — the failing layer is at the end of build output.
  printf "%s\n" "$BUILD_OUTPUT" | tail -20
  stop_at "Step 2"
fi

log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
if ! command -v openenv &>/dev/null; then
  fail "openenv command not found"
  hint "Install it: pip install openenv-core"
  stop_at "Step 3"
fi

VALIDATE_OK=false
VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true

if [ "$VALIDATE_OK" = true ]; then
  pass "openenv validate passed"
  [ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
else
  fail "openenv validate failed"
  printf "%s\n" "$VALIDATE_OUTPUT"
  stop_at "Step 3"
fi

printf "\n${BOLD}========================================${NC}\n"
printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
printf "${BOLD}========================================${NC}\n\n"
exit 0
|