# CivicAI / scripts / evaluate.py
# (repository page header: mahammadaftab's picture · Initial Update · 315caa2)
"""
CivicAI Evaluation & Metrics Script
Runs multiple episodes across all tasks, computes metrics,
generates reward curve plots and comparison tables.
"""
from __future__ import annotations
import json
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
from civicai.environment import CivicAIEnv
from civicai.models import Action, SubsidyPolicy
from agents.orchestrator import Orchestrator
def run_evaluation(episodes_per_task: int = 5) -> dict:
    """Compare the multi-agent orchestrator with a random-action baseline.

    For each of the three built-in tasks, ``episodes_per_task`` episodes are
    run with ``Orchestrator`` and the same number with uniformly random
    actions. Progress is printed to stdout as the evaluation proceeds.

    Args:
        episodes_per_task: Number of episodes to run per task for each of
            the two policies. Must be at least 1.

    Returns:
        A dict keyed by task id. Each value holds ``agent_mean``,
        ``agent_std``, ``random_mean``, ``random_std`` and ``improvement``
        (agent mean minus random mean), all rounded to 4 decimal places.

    Raises:
        ValueError: If ``episodes_per_task`` is less than 1. Without this
            check the statistics would be computed over empty lists and
            come back as NaN.
    """
    # Imported locally: the module's top-level imports do not include it.
    import random

    if episodes_per_task < 1:
        raise ValueError(
            f"episodes_per_task must be at least 1, got {episodes_per_task}"
        )

    tasks = ["stabilize_economy", "manage_pandemic", "control_crisis"]
    max_random_steps = 50  # upper bound on steps in one random-baseline episode
    policies = list(SubsidyPolicy)  # loop-invariant, so build it once
    results = {}

    for task_id in tasks:
        print(f"\n{'='*50}")
        print(f" Evaluating: {task_id}")
        print(f"{'='*50}")

        # Multi-agent baseline: a fresh environment and orchestrator per episode.
        agent_rewards = []
        for ep in range(episodes_per_task):
            env = CivicAIEnv()
            orch = Orchestrator(env)
            result = orch.run_episode(task_id)
            agent_rewards.append(result["avg_reward"])
            print(f" Agent ep{ep}: avg_reward={result['avg_reward']:.4f}")

        # Random baseline: sample every action field uniformly at each step.
        random_rewards = []
        for _ in range(episodes_per_task):
            env = CivicAIEnv()
            env.reset(task_id)
            total_r = 0.0
            count = 0
            for _ in range(max_random_steps):
                action = Action(
                    tax_rate=random.uniform(0.1, 0.5),
                    healthcare_budget=random.uniform(0.05, 0.4),
                    education_budget=random.uniform(0.05, 0.3),
                    police_budget=random.uniform(0.03, 0.2),
                    subsidy_policy=random.choice(policies),
                )
                _, r, done, _ = env.step(action)
                total_r += r
                count += 1
                if done:
                    break
            # Average over the steps actually taken; max() guards against /0.
            random_rewards.append(total_r / max(1, count))

        agent_mean = float(np.mean(agent_rewards))
        random_mean = float(np.mean(random_rewards))
        results[task_id] = {
            "agent_mean": round(agent_mean, 4),
            "agent_std": round(float(np.std(agent_rewards)), 4),
            "random_mean": round(random_mean, 4),
            "random_std": round(float(np.std(random_rewards)), 4),
            "improvement": round(agent_mean - random_mean, 4),
        }
    return results
def _style_dark_axes(fig, ax, with_legend: bool = True) -> None:
    """Apply the shared dark colour scheme to ``fig`` and ``ax``."""
    fig.patch.set_facecolor("#0f172a")
    ax.set_facecolor("#1e293b")
    ax.tick_params(colors="white")
    if with_legend:
        ax.legend(facecolor="#1e293b", edgecolor="#334155", labelcolor="white")
    ax.spines["bottom"].set_color("#334155")
    ax.spines["left"].set_color("#334155")
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)


def generate_plots(results: dict) -> None:
    """Write the evaluation figures into the ``assets`` directory.

    Two PNG files are produced:

    * ``assets/comparison_chart.png`` - grouped bars of random vs. agent mean
      reward per task, with the standard deviations as error bars.
    * ``assets/reward_curve.png`` - the per-turn reward of one freshly run
      ``stabilize_economy`` episode, with its mean drawn as a dashed line.

    Args:
        results: Mapping of task id to a stats dict containing
            ``agent_mean``, ``agent_std``, ``random_mean`` and ``random_std``.
    """
    os.makedirs("assets", exist_ok=True)

    # Display names for the known tasks. Unknown task ids fall back to the id
    # itself so the label count always matches the number of bars.
    display_names = {
        "stabilize_economy": "Economic\nStability",
        "manage_pandemic": "Pandemic\nManagement",
        "control_crisis": "Social\nCrisis",
    }

    # --- Comparison Bar Chart ---
    fig, ax = plt.subplots(figsize=(10, 6))
    tasks = list(results.keys())
    x = np.arange(len(tasks))
    width = 0.35
    agent_means = [results[t]["agent_mean"] for t in tasks]
    random_means = [results[t]["random_mean"] for t in tasks]
    agent_stds = [results[t]["agent_std"] for t in tasks]
    random_stds = [results[t]["random_std"] for t in tasks]
    ax.bar(x - width/2, random_means, width, yerr=random_stds,
           label="Random", color="#ef4444", alpha=0.8, capsize=4)
    ax.bar(x + width/2, agent_means, width, yerr=agent_stds,
           label="Multi-Agent", color="#06b6d4", alpha=0.8, capsize=4)
    ax.set_ylabel("Avg Reward", color="white", fontsize=12)
    ax.set_title("CivicAI: Agent vs Random Performance", color="white", fontsize=14, fontweight="bold")
    ax.set_xticks(x)
    ax.set_xticklabels([display_names.get(t, t) for t in tasks], color="white")
    _style_dark_axes(fig, ax)
    # NOTE(review): the fixed range assumes average rewards lie in [0, 1] - confirm.
    ax.set_ylim(0, 1)
    fig.tight_layout()
    fig.savefig("assets/comparison_chart.png", dpi=150, facecolor="#0f172a")
    plt.close(fig)
    print(" Saved: assets/comparison_chart.png")

    # --- Reward Curve ---
    fig, ax = plt.subplots(figsize=(10, 5))
    # The curve comes from a new episode, not from the runs behind ``results``.
    env = CivicAIEnv()
    orch = Orchestrator(env)
    result = orch.run_episode("stabilize_economy")
    curve = result["reward_curve"]
    curve_mean = np.mean(curve)
    ax.plot(curve, color="#06b6d4", linewidth=2, label="Multi-Agent Reward")
    ax.fill_between(range(len(curve)), curve, alpha=0.15, color="#06b6d4")
    ax.axhline(y=curve_mean, color="#a855f7", linestyle="--", alpha=0.7, label=f"Mean: {curve_mean:.3f}")
    ax.set_xlabel("Turn", color="white", fontsize=12)
    ax.set_ylabel("Reward", color="white", fontsize=12)
    ax.set_title("CivicAI: Reward Curve (Economic Stability)", color="white", fontsize=14, fontweight="bold")
    _style_dark_axes(fig, ax)
    fig.tight_layout()
    fig.savefig("assets/reward_curve.png", dpi=150, facecolor="#0f172a")
    plt.close(fig)
    print(" Saved: assets/reward_curve.png")
def print_results_table(results: dict) -> None:
    """Write the per-task comparison table to stdout.

    Args:
        results: Mapping of task id to a stats dict; the ``random_mean``,
            ``agent_mean`` and ``improvement`` entries are printed with four
            decimals, the improvement always carrying an explicit sign.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 55

    print(f"\n{heavy_rule}")
    print(f" {'Task':<25} {'Random':>10} {'Agent':>10} {'Improve':>10}")
    print(f" {light_rule}")
    for name, stats in results.items():
        baseline = stats["random_mean"]
        agent = stats["agent_mean"]
        gain = stats["improvement"]
        print(f" {name:<25} {baseline:>10.4f} {agent:>10.4f} {gain:>+10.4f}")
    print(heavy_rule)
if __name__ == "__main__":
    # Optional first CLI argument: episodes per task (falls back to 3).
    cli_args = sys.argv[1:]
    if cli_args:
        episodes = int(cli_args[0])
    else:
        episodes = 3

    print("\n[CivicAI] Evaluation Suite\n")
    results = run_evaluation(episodes)
    print_results_table(results)
    generate_plots(results)

    # Persist the raw numbers next to the generated figures.
    os.makedirs("assets", exist_ok=True)
    with open("assets/evaluation_results.json", "w") as out_file:
        json.dump(results, out_file, indent=2)
    print("\n Results saved to assets/evaluation_results.json")