# Source: CivicAI / scripts / train_ppo.py
# (Hugging Face upload header removed — author: mahammadaftab, commit 6298125, "Final updated")
"""
CivicAI TRL PPO Training Script β€” scripts/train_ppo.py
=======================================================
Full training pipeline using HuggingFace TRL.
LLM (GPT-2) receives society state as text β†’ outputs JSON action.
PPO optimises the LLM against the CivicAI environment reward.
"""
from __future__ import annotations
import os, sys, json, random
import numpy as np
import torch
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from civicai.environment import CivicAIEnv
from civicai.models import Action, SubsidyPolicy
from civicai.reward import get_named_scores, compute_reward
# ── Config ────────────────────────────────────────────────────────────────────
MODEL_NAME = "gpt2"            # swap for "meta-llama/Llama-3.2-1B" on Colab A100
TASK_ID = "stabilize_economy"  # environment task identifier
N_EPISODES = 20                # episodes to train
STEPS_EP = 50                  # max steps per episode
BATCH_SIZE = 1
LR = 1.41e-5
SEED = 42

# Seed every RNG source this script touches so runs are reproducible.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Dark-theme palette shared by the matplotlib figures below.
DARK, PANEL, GRID = "#0f172a", "#1e293b", "#334155"
# ── Prompt / Parser ───────────────────────────────────────────────────────────
def obs_to_prompt(obs: dict) -> str:
    """Format an observation dict as a one-line instruction prompt for the LLM.

    The prompt states the current society metrics, spells out the JSON action
    schema the model must emit, and ends with "Action:" as the generation cue.
    """
    state = (
        f"You are a policy advisor. State: Turn={obs['turn']}, "
        f"GDP=${obs['gdp']:.0f}B, Inflation={obs['inflation']:.1%}, "
        f"Employment={obs['employment_rate']:.1%}, "
        f"Satisfaction={obs['public_satisfaction']:.1%}, "
        f"Health={obs['health_index']:.1%}, Crime={obs['crime_rate']:.1%}. "
    )
    schema = (
        '{"tax_rate":0.0-1.0,"healthcare_budget":0.0-1.0,'
        '"education_budget":0.0-1.0,"police_budget":0.0-1.0,'
        '"subsidy_policy":"none|agriculture|industry|technology"}'
    )
    return state + "Output JSON: " + schema + " Action:"
def parse_action(text: str) -> Action:
    """Extract the first {...}-delimited JSON object from *text* into an Action.

    Budget fields are clamped into [0, 1] and missing keys fall back to mild
    defaults. If no valid JSON object can be recovered, a random (but
    plausible) Action is returned so the environment can always advance.
    """
    def _clamp01(v) -> float:
        return max(0.0, min(1.0, float(v)))

    try:
        start = text.find("{")
        end = text.rfind("}")
        if start != -1 and end != -1:
            payload = json.loads(text[start:end + 1])
            return Action(
                tax_rate=_clamp01(payload.get("tax_rate", 0.25)),
                healthcare_budget=_clamp01(payload.get("healthcare_budget", 0.20)),
                education_budget=_clamp01(payload.get("education_budget", 0.15)),
                police_budget=_clamp01(payload.get("police_budget", 0.10)),
                subsidy_policy=SubsidyPolicy(payload.get("subsidy_policy", "none")),
            )
    except Exception:
        pass  # malformed model output — fall through to the random fallback
    return Action(
        tax_rate=random.uniform(0.2, 0.4),
        healthcare_budget=random.uniform(0.1, 0.3),
        education_budget=random.uniform(0.05, 0.2),
        police_budget=random.uniform(0.05, 0.15),
    )
# ── Random Baseline ───────────────────────────────────────────────────────────
def run_random_baseline(n: int = 5) -> float:
    """Return the mean per-step reward of a uniformly random policy.

    Runs *n* episodes (seeds 0..n-1) of at most STEPS_EP steps each, sampling
    actions from fixed uniform ranges, and averages the per-episode mean
    step rewards.
    """
    env = CivicAIEnv()
    episode_means: list[float] = []
    for seed in range(n):
        # Dedicated per-episode RNG so the baseline is reproducible.
        rng = random.Random(seed)
        env.reset(task_id=TASK_ID, seed=seed)
        step_rewards = []
        for _ in range(STEPS_EP):
            action = Action(
                tax_rate=rng.uniform(0.15, 0.5),
                healthcare_budget=rng.uniform(0.08, 0.35),
                education_budget=rng.uniform(0.05, 0.25),
                police_budget=rng.uniform(0.03, 0.18),
            )
            _, reward, done, _ = env.step(action)
            step_rewards.append(reward)
            if done:
                break
        episode_means.append(float(np.mean(step_rewards)))
    return float(np.mean(episode_means))
# ── Main Training ─────────────────────────────────────────────────────────────
def train_ppo():
    """Run the full PPO fine-tuning loop against the CivicAI environment.

    Pipeline: build policy + frozen reference models, compute a random-action
    baseline, then for N_EPISODES episodes prompt the LLM with the society
    state, parse its JSON action, step the environment, and feed the scalar
    reward back through ``PPOTrainer.step``. Saves the trained model, a JSON
    results file, and two plots under ``assets/``.

    Returns:
        dict with baseline average, per-episode rewards/components, the
        final-5-episode average, and improvement over baseline.

    NOTE(review): this uses the positional ``PPOTrainer(config, model,
    ref_model, tokenizer)`` constructor and ``PPOConfig(model_name=...)`` —
    the pre-0.8 TRL API. Verify the pinned ``trl`` version matches.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"[CivicAI] TRL PPO Training | model={MODEL_NAME} device={device}")
    # Models: trainable policy (with value head) plus a frozen reference copy
    # used by TRL for the KL penalty.
    config = PPOConfig(
        model_name=MODEL_NAME,
        learning_rate=LR,
        batch_size=BATCH_SIZE,
        mini_batch_size=1,
        gradient_accumulation_steps=1,
        log_with=None,  # no external experiment tracker
    )
    model = AutoModelForCausalLMWithValueHead.from_pretrained(MODEL_NAME).to(device)
    ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(MODEL_NAME).to(device)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default
    ppo = PPOTrainer(config, model, ref_model, tokenizer)
    gen_kwargs = dict(
        max_new_tokens=80, do_sample=True, top_k=50, top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )
    env = CivicAIEnv()
    # Baseline: average reward of random actions, for the training plot.
    print("[CivicAI] Computing random baseline...")
    baseline_avg = run_random_baseline(5)
    print(f" Random baseline avg reward: {baseline_avg:.4f}")
    # Training loop: one PPO update per environment step (batch size 1).
    episode_rewards, episode_components = [], []
    print(f"[CivicAI] Training for {N_EPISODES} episodes...")
    for ep in range(N_EPISODES):
        obs = env.reset(task_id=TASK_ID, seed=ep)  # new seed each episode
        ep_rewards, ep_comp = [], []
        for step in tqdm(range(STEPS_EP), desc=f"Ep {ep+1}/{N_EPISODES}", leave=False):
            prompt = obs_to_prompt(obs.model_dump())
            query = tokenizer.encode(prompt, return_tensors="pt").to(device)[0]
            response = ppo.generate(query.unsqueeze(0), **gen_kwargs)
            # Strip the prompt tokens so only the generated continuation remains.
            response_ids = response[0][len(query):]
            text = tokenizer.decode(response_ids, skip_special_tokens=True)
            action = parse_action(text)
            obs, reward, done, info = env.step(action)
            # Named component scores (economic/health/satisfaction/crime) for plotting.
            state = env.state()
            robj = compute_reward(state, action)
            ep_comp.append(get_named_scores(robj))
            reward_t = torch.tensor([reward], dtype=torch.float).to(device)
            # PPO update on this single (query, response, reward) triple.
            ppo.step([query], [response_ids], [reward_t])
            ep_rewards.append(reward)
            if done:
                break
        avg_r = float(np.mean(ep_rewards))
        episode_rewards.append(avg_r)
        # Per-episode mean of each named component score.
        episode_components.append({
            k: round(float(np.mean([c[k] for c in ep_comp])), 4)
            for k in ep_comp[0]
        })
        print(f" Ep {ep+1:2d}: avg_reward={avg_r:.4f} "
              + " ".join(f"{k}={v:.3f}" for k, v in episode_components[-1].items()))
    # ── Save model ────────────────────────────────────────────────────────────
    os.makedirs("assets", exist_ok=True)
    model.save_pretrained("assets/civicai_ppo_model")
    tokenizer.save_pretrained("assets/civicai_ppo_model")
    print("\n Model saved to assets/civicai_ppo_model/")
    # ── Save JSON results ─────────────────────────────────────────────────────
    results = {
        "baseline_avg": baseline_avg,
        "episode_rewards": episode_rewards,
        "episode_components": episode_components,
        # Final performance is judged on the last 5 episodes.
        "final_avg": float(np.mean(episode_rewards[-5:])),
        "improvement": float(np.mean(episode_rewards[-5:])) - baseline_avg,
    }
    with open("assets/training_results.json", "w") as f:
        json.dump(results, f, indent=2)
    # ── Plots ─────────────────────────────────────────────────────────────────
    _plot_training_curve(episode_rewards, baseline_avg)
    _plot_component_breakdown(episode_components)
    print("\n[CivicAI] Training complete.")
    print(f" Baseline avg: {baseline_avg:.4f}")
    print(f" Final 5-ep avg: {results['final_avg']:.4f}")
    print(f" Improvement: {results['improvement']:+.4f}")
    return results
def _plot_training_curve(rewards: list[float], baseline: float) -> None:
    """Save a dark-themed training curve to ``assets/reward_curve.png``.

    Plots raw per-episode rewards faintly, a 3-episode moving average on top,
    the random baseline as a dashed line, and shades where the smoothed curve
    beats the baseline.

    Fixes vs. the original: the ``mode="valid"`` moving average was plotted
    starting at x=0 even though its values are centered on episodes
    1..N-2, shifting the smoothed line (and the fill region) left by one
    episode; there was also no guard for fewer than 3 episodes, which
    produced an empty smoothed series.
    """
    window = 3
    if len(rewards) >= window:
        smooth = np.convolve(rewards, np.ones(window) / window, mode="valid")
        # "valid" drops (window-1)//2 points per edge: value i is centered
        # on episode i + window//2, so start the x axis there.
        xs = list(range(window // 2, window // 2 + len(smooth)))
    else:
        # Too few episodes to smooth — fall back to the raw series.
        smooth = np.asarray(rewards, dtype=float)
        xs = list(range(len(smooth)))
    fig, ax = plt.subplots(figsize=(10, 5))
    fig.patch.set_facecolor(DARK); ax.set_facecolor(PANEL)
    ax.plot(rewards, color="#06b6d4", alpha=0.4, linewidth=1)
    ax.plot(xs, smooth, color="#06b6d4", linewidth=2.5,
            label=f"PPO Agent (final={rewards[-1]:.3f})")
    ax.axhline(baseline, color="#ef4444", linestyle="--", linewidth=1.8,
               label=f"Random Baseline ({baseline:.3f})")
    ax.fill_between(xs, smooth, baseline,
                    where=[s > baseline for s in smooth],
                    alpha=0.15, color="#06b6d4", label="Improvement over baseline")
    ax.set_ylim(0, 1.05)
    ax.set_xlabel("Episode", color="#94a3b8"); ax.set_ylabel("Avg Step Reward", color="#94a3b8")
    ax.set_title("CivicAI TRL PPO — Training Curve", color="#e2e8f0", fontsize=14, fontweight="bold")
    ax.tick_params(colors="#94a3b8")
    for sp in ax.spines.values(): sp.set_edgecolor(GRID)
    ax.grid(axis="y", color=GRID, linewidth=0.5, linestyle="--")
    ax.legend(facecolor=PANEL, edgecolor=GRID, labelcolor="#e2e8f0")
    plt.tight_layout()
    plt.savefig("assets/reward_curve.png", dpi=150, facecolor=DARK)
    plt.close()
    print(" Saved: assets/reward_curve.png")
def _plot_component_breakdown(components: list[dict]) -> None:
    """Save a 1x4 panel of named reward components to ``assets/component_scores.png``.

    One subplot per component score (economic, health, satisfaction, crime),
    each showing its per-episode trajectory over training, styled to match
    the dark dashboard theme.
    """
    panels = [
        ("economic_score", "#f59e0b"),
        ("health_score", "#10b981"),
        ("satisfaction_score", "#a78bfa"),
        ("crime_score", "#f97316"),
    ]
    fig, axes = plt.subplots(1, 4, figsize=(16, 4))
    fig.patch.set_facecolor(DARK)
    fig.suptitle("Named Reward Components Over Training", color="#e2e8f0",
                 fontsize=13, fontweight="bold")
    for axis, (key, color) in zip(axes, panels):
        series = [entry[key] for entry in components]
        axis.set_facecolor(PANEL)
        axis.plot(series, color=color, linewidth=2)
        axis.fill_between(range(len(series)), series, alpha=0.15, color=color)
        axis.set_ylim(0, 1.05)
        axis.set_title(key.replace("_score", "").capitalize(), color="#e2e8f0", fontsize=11)
        axis.tick_params(colors="#94a3b8", labelsize=8)
        for spine in axis.spines.values():
            spine.set_edgecolor(GRID)
        axis.grid(color=GRID, linewidth=0.4, linestyle="--")
    plt.tight_layout()
    plt.savefig("assets/component_scores.png", dpi=150, facecolor=DARK)
    plt.close()
    print(" Saved: assets/component_scores.png")
if __name__ == "__main__":
    # Script entry point: run the full PPO training pipeline.
    train_ppo()