Spaces:

mahammadaftab
/

CivicAI

Sleeping

App Files Files Community

CivicAI / scripts /evaluate.py

mahammadaftab

Initial Update

315caa2 12 days ago

raw

history blame contribute delete

6.25 kB

	"""
	CivicAI Evaluation & Metrics Script

	Runs multiple episodes across all tasks, computes metrics,
	generates reward curve plots and comparison tables.
	"""

	from __future__ import annotations

	import json
	import os
	import sys

	sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

	import matplotlib
	matplotlib.use("Agg")
	import matplotlib.pyplot as plt
	import numpy as np

	from civicai.environment import CivicAIEnv
	from civicai.models import Action, SubsidyPolicy
	from agents.orchestrator import Orchestrator


	def run_evaluation(episodes_per_task: int = 5, *, max_steps: int = 50) -> dict:
	    """Evaluate the multi-agent orchestrator against a random baseline.

	    For every task, ``episodes_per_task`` episodes are run through a fresh
	    ``Orchestrator`` and the same number of episodes are played with
	    randomly sampled actions. Per-episode average rewards are then
	    aggregated per task.

	    Args:
	        episodes_per_task: Number of episodes to run per task for each of
	            the two policies. Must be at least 1.
	        max_steps: Upper bound on the number of steps in one
	            random-baseline episode (the episode also stops when the
	            environment reports ``done``).

	    Returns:
	        A dict keyed by task id. Each value holds ``agent_mean``,
	        ``agent_std``, ``random_mean``, ``random_std`` and ``improvement``
	        (agent mean minus random mean), all rounded to 4 decimals.

	    Raises:
	        ValueError: If ``episodes_per_task`` is smaller than 1. Without
	            this check the statistics would be taken over empty lists and
	            every metric would come out as NaN.
	    """
	    if episodes_per_task < 1:
	        raise ValueError(
	            f"episodes_per_task must be >= 1, got {episodes_per_task}"
	        )

	    tasks = ["stabilize_economy", "manage_pandemic", "control_crisis"]
	    results = {}

	    for task_id in tasks:
	        print(f"\n{'='*50}")
	        print(f" Evaluating: {task_id}")
	        print(f"{'='*50}")

	        # Multi-agent baseline: a fresh environment and orchestrator per episode.
	        agent_rewards = []
	        for ep in range(episodes_per_task):
	            env = CivicAIEnv()
	            orch = Orchestrator(env)
	            result = orch.run_episode(task_id)
	            agent_rewards.append(result["avg_reward"])
	            print(f" Agent ep{ep}: avg_reward={result['avg_reward']:.4f}")

	        # Random baseline: same number of episodes with random actions.
	        random_rewards = [
	            _run_random_episode(task_id, max_steps)
	            for _ in range(episodes_per_task)
	        ]

	        agent_mean = float(np.mean(agent_rewards))
	        random_mean = float(np.mean(random_rewards))
	        results[task_id] = {
	            "agent_mean": round(agent_mean, 4),
	            "agent_std": round(float(np.std(agent_rewards)), 4),
	            "random_mean": round(random_mean, 4),
	            "random_std": round(float(np.std(random_rewards)), 4),
	            "improvement": round(agent_mean - random_mean, 4),
	        }

	    return results


	def _run_random_episode(task_id, max_steps):
	    """Play one episode of ``task_id`` with random actions.

	    Returns the mean per-step reward, i.e. the summed reward divided by the
	    number of steps actually taken (0.0 if no step was taken).
	    """
	    # Function-scope import: ``random`` is not imported at file level.
	    import random

	    env = CivicAIEnv()
	    env.reset(task_id)
	    total_reward = 0.0
	    steps = 0
	    for _ in range(max_steps):
	        # Each field is drawn independently from a fixed range.
	        action = Action(
	            tax_rate=random.uniform(0.1, 0.5),
	            healthcare_budget=random.uniform(0.05, 0.4),
	            education_budget=random.uniform(0.05, 0.3),
	            police_budget=random.uniform(0.03, 0.2),
	            subsidy_policy=random.choice(list(SubsidyPolicy)),
	        )
	        # Only the reward and the termination flag are needed here.
	        _, reward, done, _ = env.step(action)
	        total_reward += reward
	        steps += 1
	        if done:
	            break
	    # max(1, steps) guards the division when max_steps is 0.
	    return total_reward / max(1, steps)


	def generate_plots(results: dict) -> None:
	    """Render the evaluation figures into the ``assets`` directory.

	    Two PNG files are written:

	    * ``assets/comparison_chart.png`` - grouped bar chart of the random and
	      multi-agent mean rewards per task, with standard deviations as
	      error bars.
	    * ``assets/reward_curve.png`` - reward curve of one freshly run
	      ``stabilize_economy`` episode.

	    Args:
	        results: Mapping of task id to a dict providing ``agent_mean``,
	            ``agent_std``, ``random_mean`` and ``random_std``. Any number
	            of tasks is accepted; task ids without a known display label
	            are shown as-is on the x axis.
	    """
	    os.makedirs("assets", exist_ok=True)
	    _plot_comparison_chart(results)
	    _plot_reward_curve()


	def _new_dark_figure(figsize):
	    """Create a figure and axes with the dark background colours applied."""
	    fig, ax = plt.subplots(figsize=figsize)
	    fig.patch.set_facecolor("#0f172a")
	    ax.set_facecolor("#1e293b")
	    return fig, ax


	def _finish_dark_axes(ax):
	    """Apply tick colours, the legend and spine styling shared by both plots."""
	    ax.tick_params(colors="white")
	    ax.legend(facecolor="#1e293b", edgecolor="#334155", labelcolor="white")
	    ax.spines["bottom"].set_color("#334155")
	    ax.spines["left"].set_color("#334155")
	    ax.spines["top"].set_visible(False)
	    ax.spines["right"].set_visible(False)


	def _save_figure(fig, path):
	    """Lay out, save and close ``fig``, then report the written path."""
	    fig.tight_layout()
	    fig.savefig(path, dpi=150, facecolor="#0f172a")
	    # Close this specific figure so it does not stay registered in pyplot.
	    plt.close(fig)
	    print(f" Saved: {path}")


	def _plot_comparison_chart(results):
	    """Draw the random vs multi-agent bar chart from ``results``."""
	    fig, ax = _new_dark_figure((10, 6))

	    tasks = list(results.keys())
	    x = np.arange(len(tasks))
	    width = 0.35

	    agent_means = [results[t]["agent_mean"] for t in tasks]
	    random_means = [results[t]["random_mean"] for t in tasks]
	    agent_stds = [results[t]["agent_std"] for t in tasks]
	    random_stds = [results[t]["random_std"] for t in tasks]

	    ax.bar(x - width/2, random_means, width, yerr=random_stds,
	           label="Random", color="#ef4444", alpha=0.8, capsize=4)
	    ax.bar(x + width/2, agent_means, width, yerr=agent_stds,
	           label="Multi-Agent", color="#06b6d4", alpha=0.8, capsize=4)

	    ax.set_ylabel("Avg Reward", color="white", fontsize=12)
	    ax.set_title("CivicAI: Agent vs Random Performance", color="white", fontsize=14, fontweight="bold")
	    ax.set_xticks(x)

	    # Look labels up per task id rather than relying on a fixed-length list:
	    # the label count must equal the tick count, and each label must follow
	    # its own task regardless of the order of ``results``.
	    display_names = {
	        "stabilize_economy": "Economic\nStability",
	        "manage_pandemic": "Pandemic\nManagement",
	        "control_crisis": "Social\nCrisis",
	    }
	    task_labels = [display_names.get(t, t) for t in tasks]
	    ax.set_xticklabels(task_labels, color="white")

	    _finish_dark_axes(ax)
	    ax.set_ylim(0, 1)

	    _save_figure(fig, "assets/comparison_chart.png")


	def _plot_reward_curve():
	    """Run one ``stabilize_economy`` episode and plot its reward curve."""
	    fig, ax = _new_dark_figure((10, 5))

	    env = CivicAIEnv()
	    orch = Orchestrator(env)
	    result = orch.run_episode("stabilize_economy")
	    curve = result["reward_curve"]
	    curve_mean = np.mean(curve)

	    ax.plot(curve, color="#06b6d4", linewidth=2, label="Multi-Agent Reward")
	    ax.fill_between(range(len(curve)), curve, alpha=0.15, color="#06b6d4")
	    ax.axhline(y=curve_mean, color="#a855f7", linestyle="--", alpha=0.7, label=f"Mean: {curve_mean:.3f}")

	    ax.set_xlabel("Turn", color="white", fontsize=12)
	    ax.set_ylabel("Reward", color="white", fontsize=12)
	    ax.set_title("CivicAI: Reward Curve (Economic Stability)", color="white", fontsize=14, fontweight="bold")

	    _finish_dark_axes(ax)

	    _save_figure(fig, "assets/reward_curve.png")


	def print_results_table(results: dict) -> None:
	    """Write the per-task comparison table to stdout.

	    Args:
	        results: Mapping of task id to a dict with the numeric entries
	            ``random_mean``, ``agent_mean`` and ``improvement``. One table
	            row is printed per task, in the mapping's iteration order.
	    """
	    border = "=" * 70
	    separator = "-" * 55

	    print(f"\n{border}")
	    print(f" {'Task':<25} {'Random':>10} {'Agent':>10} {'Improve':>10}")
	    print(f" {separator}")

	    for name, stats in results.items():
	        random_mean = stats["random_mean"]
	        agent_mean = stats["agent_mean"]
	        delta = stats["improvement"]
	        # The improvement column always shows an explicit sign.
	        print(f" {name:<25} {random_mean:>10.4f} {agent_mean:>10.4f} {delta:>+10.4f}")

	    print(border)


	if __name__ == "__main__":
	    # Optional first CLI argument: episodes per task (3 when omitted).
	    cli_args = sys.argv[1:]
	    if cli_args:
	        episode_count = int(cli_args[0])
	    else:
	        episode_count = 3

	    print("\n[CivicAI] Evaluation Suite\n")
	    summary = run_evaluation(episode_count)
	    print_results_table(summary)
	    generate_plots(summary)

	    # Persist the numeric results as JSON in the assets directory.
	    os.makedirs("assets", exist_ok=True)
	    with open("assets/evaluation_results.json", "w") as out_file:
	        out_file.write(json.dumps(summary, indent=2))
	    print("\n Results saved to assets/evaluation_results.json")