"""
Track 2: REAL MODEL validation via Groq + Llama 3.3 70B.

Runs the self-improvement loop with an actual LLM, not mocks.
Demonstrates that Purpose Learning works with real inference.

Usage:
    export GROQ_API_KEY="gsk_..."
    python benchmarks/validate_real.py
"""
import sys
import os
import json
import time
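
# Put the repo root on sys.path so purpose_agent imports without installation.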
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
from purpose_agent.types import State, Action
from purpose_agent.llm_backend import resolve_backend, ChatMessage
from purpose_agent.orchestrator import Environment, Orchestrator
|
|
GROQ_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_KEY:
    print("Set GROQ_API_KEY to run this benchmark.")
    sys.exit(1)
|
|
# Model id in "<provider>:<model>" form; resolve_backend() picks the backend
# from the prefix.
MODEL = "groq:llama-3.3-70b-versatile"
|
|
class CodeEnv(Environment):
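    """Environment that executes submitted code against the task's test cases."""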

    def __init__(self, tests):
        self.tests = tests

    def execute(self, action: Action, current_state: State) -> State:
        code = action.params.get("code", action.thought or "")
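
        # Models sometimes emit the function in 'thought' or 'expected_delta'
        # instead of the 'code' param; fall back to whichever field has one.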
        if not code.strip() or "def " not in code:
            for field in [action.expected_delta, action.thought]:
                if field and "def " in field:
                    code = field
                    break

        data = {**current_state.data, "attempts": current_state.data.get("attempts", 0) + 1}
        passed, fails = 0, []
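        # NOTE: exec()/eval() run model-written code in-process with no
        # sandboxing; fine for a local benchmark, unsafe for untrusted input.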
        for tc in self.tests:
            try:
                ns = {}
                exec(code, ns)
                result = str(eval(tc["input"], ns))
                if result.strip() == str(tc["expected"]).strip():
                    passed += 1
                else:
                    fails.append(f'{tc["input"]}: want {tc["expected"]}, got {result}')
            except Exception as e:
                fails.append(f'{tc["input"]}: {type(e).__name__}: {e}')

        total = len(self.tests)
        data.update({
            "tests_passed": passed, "tests_total": total,
            "pass_rate": passed / total if total else 0,
            "all_passed": passed == total,
            "failures": fails[:3], "last_code": code[:500],
        })
        summary = f"Tests: {passed}/{total}" + (
            " | ALL PASSED ✓" if passed == total else f" | Fails: {'; '.join(fails[:2])}"
        )
        return State(data=data, summary=summary)

    def reset(self):
        return State(data={"attempts": 0})

    def is_terminal(self, state):
        return state.data.get("all_passed", False)
|
|
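# Each task pairs a natural-language purpose (the instruction handed to the
# agent) with executable test cases that CodeEnv checks.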
TASKS = {
    "fibonacci": {
        "purpose": (
            "Write a Python function called fib(n) that returns the nth Fibonacci number. "
            "fib(0)=0, fib(1)=1, fib(5)=5, fib(10)=55. "
            "Use the submit_code action with your code in the 'code' parameter."
        ),
        "tests": [
            {"input": "fib(0)", "expected": "0"},
            {"input": "fib(1)", "expected": "1"},
            {"input": "fib(5)", "expected": "5"},
            {"input": "fib(10)", "expected": "55"},
        ],
    },
    "fizzbuzz": {
        "purpose": (
            "Write a Python function called fizzbuzz(n) that returns: "
            "'Fizz' if n is divisible by 3, 'Buzz' if by 5, 'FizzBuzz' if by both, else str(n). "
            "Use the submit_code action with your code in the 'code' parameter."
        ),
        "tests": [
            {"input": "fizzbuzz(3)", "expected": "Fizz"},
            {"input": "fizzbuzz(5)", "expected": "Buzz"},
            {"input": "fizzbuzz(15)", "expected": "FizzBuzz"},
            {"input": "fizzbuzz(7)", "expected": "7"},
        ],
    },
}
|
|
def run_task_with_real_model(task_name: str, orch: Orchestrator, run_num: int) -> dict:
    """Run one task and return metrics."""
    task = TASKS[task_name]
    env = CodeEnv(task["tests"])
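    # The orchestrator is reused across runs so learned heuristics persist;
    # only the environment is swapped in fresh.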
    orch.environment = env

    start = time.time()
    try:
        result = orch.run_task(
            purpose=task["purpose"],
            initial_state=env.reset(),
            max_steps=3,
        )
        phi = result.final_phi or 0
        steps = result.total_steps
        pass_rate = result.final_state.data.get("pass_rate", 0)
        all_passed = result.final_state.data.get("all_passed", False)
    except Exception as e:
        print(f"  ERROR: {e}")
        phi, steps, pass_rate, all_passed = 0, 0, 0, False

    elapsed = time.time() - start
    n_heur = len(orch.optimizer.heuristic_library)

    status = "✓" if all_passed else "✗"
    print(f"  Run {run_num}: {status} Φ={phi:.1f} pass={pass_rate:.0%} steps={steps} heur={n_heur} ({elapsed:.1f}s)")

    return {
        "run": run_num, "phi": round(phi, 1), "pass_rate": round(pass_rate, 2),
        "steps": steps, "all_passed": all_passed, "heuristics": n_heur,
        "time_s": round(elapsed, 1),
    }
|
|
def main():
    print("╔" + "═" * 54 + "╗")
    print(f"║ {'Track 2: REAL MODEL Validation':<52} ║")
    print(f"║ Model: {MODEL:<45} ║")
    print("╚" + "═" * 54 + "╝\n")

    backend = resolve_backend(MODEL, api_key=GROQ_KEY)
|
    # Cheap connectivity check before spending tokens on the full benchmark.
    print("Testing connection...")
    r = backend.generate(
        [ChatMessage(role="user", content="Say 'ok' and nothing else.")],
        temperature=0.1, max_tokens=500,
    )
    print(f"  Response: \"{r[:50]}\"")
    print()

    results = {}

    for task_name in TASKS:
        print(f"─── {task_name} (3 runs, learning persists) ───")

        env = CodeEnv(TASKS[task_name]["tests"])
        orch = Orchestrator(
            llm=backend,
            environment=env,
            available_actions={
                "submit_code": "Submit Python code. Put the code in the 'code' parameter.",
                "DONE": "Signal task completion",
            },
            optimize_every_n_tasks=1,  # run the optimizer after every task
        )
        # Low threshold so even partially passing attempts feed the optimizer.
        orch.optimizer.min_reward_threshold = 0.1

        curve = []
        for run_num in range(1, 4):
            entry = run_task_with_real_model(task_name, orch, run_num)
            curve.append(entry)
            time.sleep(1)  # brief pause between runs to respect API rate limits

        results[task_name] = curve
|
        # Report the learning trend across the three runs.
        if len(curve) >= 2:
            delta = curve[-1]["phi"] - curve[0]["phi"]
            if delta > 0:
                print(f"  → Δ(Φ) = {delta:+.1f} ✓ IMPROVED")
            elif delta == 0:
                print(f"  → Δ(Φ) = {delta:+.1f} (no change)")
            else:
                print(f"  → Δ(Φ) = {delta:+.1f} (regressed)")
        print()
|
    print("╔" + "═" * 54 + "╗")
    print(f"║ {'RESULTS':<52} ║")
    print("╚" + "═" * 54 + "╝")
    print(f"{'Task':<14} {'Run 1 Φ':>8} {'Run 3 Φ':>8} {'Delta':>8} {'Verdict'}")
    print("─" * 50)
    for task_name, curve in results.items():
        r1 = curve[0]["phi"]
        r3 = curve[-1]["phi"]
        delta = r3 - r1
        verdict = "✓ IMPROVED" if delta > 0 else "= SAME" if delta == 0 else "✗ REGRESSED"
        print(f"{task_name:<14} {r1:>8.1f} {r3:>8.1f} {delta:>+8.1f} {verdict}")
|
    os.makedirs("benchmarks/results", exist_ok=True)
    with open("benchmarks/results/real_model_results.json", "w") as f:
        json.dump({"model": MODEL, "results": results}, f, indent=2)
    print("\nSaved to benchmarks/results/real_model_results.json")
|
|
if __name__ == "__main__":
    main()
|
|