| { |
| "$schema": "autocode-verification-input-v1", |
| "feature_id": "F005", |
| "spec_path": "specs/F005-IMPLEMENTATION_SPEC.md", |
| "generated": "2026-03-27T12:00:00Z", |
| "verification_mode": "mvp", |
|
|
| "overview": { |
| "summary": "Automated evaluation wrapper that runs N episodes with a given policy against SQLEnvironment and returns structured metrics (success_rate, avg_reward, avg_steps). Includes a built-in RandomPolicy for instant baseline comparison. Results are collected incrementally so partial failures do not lose completed episode data.", |
| "goal": "Enable single-command evaluation: 'How does policy X perform over 100 episodes?' with structured output for training comparison (random vs trained)." |
| }, |
|
|
| "interfaces": { |
| "types": [ |
| { |
| "name": "Policy", |
| "description": "Protocol (structural subtype) for any evaluation policy. Any object with a matching select_action method satisfies this interface.", |
| "fields": [ |
| {"name": "select_action", "type": "(observation: SQLObservation) -> SQLAction", "description": "Choose an action given the current observation"} |
| ] |
| }, |
| { |
| "name": "EpisodeResult", |
| "description": "Per-episode evaluation metrics. Frozen dataclass.", |
| "fields": [ |
| {"name": "episode_index", "type": "int", "description": "0-based episode number"}, |
| {"name": "correct", "type": "bool", "description": "Whether the ANSWER action matched the gold answer"}, |
| {"name": "total_reward", "type": "float", "description": "Cumulative reward for the episode"}, |
| {"name": "steps", "type": "int", "description": "Number of steps taken in the episode"}, |
| {"name": "error", "type": "str | None", "optional": true, "description": "Error message if episode failed, None otherwise"} |
| ] |
| }, |
| { |
| "name": "EvaluationResult", |
| "description": "Aggregate evaluation metrics with per-episode breakdown. Frozen dataclass.", |
| "fields": [ |
| {"name": "success_rate", "type": "float", "description": "Fraction of correct episodes in [0.0, 1.0]"}, |
| {"name": "avg_reward", "type": "float", "description": "Mean total_reward across completed episodes"}, |
| {"name": "avg_steps", "type": "float", "description": "Mean steps across completed episodes"}, |
| {"name": "n_episodes", "type": "int", "description": "Total number of episodes attempted"}, |
| {"name": "n_completed", "type": "int", "description": "Episodes that completed without error"}, |
| {"name": "episodes", "type": "list[EpisodeResult]", "description": "Per-episode breakdown for analysis"} |
| ] |
| } |
| ], |
| "functions": [ |
| { |
| "name": "RandomPolicy.__init__", |
| "params": [ |
| {"name": "seed", "type": "int | None", "default": "None", "description": "Random seed for reproducibility"} |
| ], |
| "returns": "None", |
| "description": "Initialize random baseline policy. Deterministic given a seed." |
| }, |
| { |
| "name": "RandomPolicy.select_action", |
| "params": [ |
| {"name": "observation", "type": "SQLObservation", "description": "Current environment observation"} |
| ], |
| "returns": "SQLAction", |
| "description": "Pick a random action. If budget_remaining > 1: randomly choose DESCRIBE, SAMPLE, or QUERY. If budget_remaining == 1: ANSWER with a random guess." |
| }, |
| { |
| "name": "evaluate", |
| "params": [ |
| {"name": "env", "type": "SQLEnvironment", "description": "The environment to evaluate against"}, |
| {"name": "policy", "type": "Policy", "description": "Any object satisfying the Policy protocol"}, |
| {"name": "n_episodes", "type": "int", "default": "100", "description": "Number of episodes to run"}, |
| {"name": "seed", "type": "int | None", "default": "None", "description": "Base seed for reproducibility; when a seed is provided, episode i (0-based) uses seed+i"}, |
| {"name": "progress_callback", "type": "Callable[[int, int], None] | None", "default": "None", "description": "Optional callback(current, total) for progress reporting"} |
| ], |
| "returns": "EvaluationResult", |
| "raises": ["ValueError"], |
| "description": "Run automated evaluation of a policy over multiple episodes. Collects results incrementally -- failed episodes are recorded and evaluation continues." |
| } |
| ], |
| "api_endpoints": [] |
| }, |
|
|
| "data_flow": { |
| "primary_flow": [ |
| "evaluate() called with env, policy, n_episodes, optional seed, and optional progress_callback", |
| "For each episode i (0-based): env.reset(seed=base_seed+i), where base_seed is the seed argument of evaluate() when one is provided, returns initial SQLObservation", |
| "Loop: policy.select_action(obs) -> SQLAction, then env.step(action) -> SQLObservation, accumulate reward", |
| "Episode ends when obs.done is True; record EpisodeResult with episode_index, correct, total_reward, and steps (error is None)", |
| "Aggregate all EpisodeResults into EvaluationResult with success_rate, avg_reward, avg_steps" |
| ], |
| "alternative_flows": [ |
| { |
| "condition": "n_episodes is 0", |
| "steps": ["Return EvaluationResult with success_rate=0.0, avg_reward=0.0, avg_steps=0.0, n_episodes=0, n_completed=0, and an empty episodes list"] |
| }, |
| { |
| "condition": "Exception during episode (reset, select_action, or step fails)", |
| "steps": [ |
| "Catch exception", |
| "Record EpisodeResult with correct=False, total_reward=0.0, steps=0, error=str(exc)", |
| "Continue to next episode" |
| ] |
| } |
| ] |
| }, |
|
|
| "error_handling": { |
| "error_types": [ |
| { |
| "name": "ValueError", |
| "when": "n_episodes < 0", |
| "handling": "Raise immediately before starting evaluation" |
| }, |
| { |
| "name": "Exception (per-episode)", |
| "when": "Any exception during env.reset(), policy.select_action(), or env.step()", |
| "handling": "Catch, record as failed EpisodeResult with error field, continue to next episode" |
| } |
| ], |
| "retry_strategy": null |
| }, |
|
|
| "dependencies": { |
| "external": [], |
| "internal": [ |
| {"name": "models.SQLAction", "usage": "Action type returned by policies"}, |
| {"name": "models.SQLObservation", "usage": "Observation type passed to policies"}, |
| {"name": "server.sql_environment.SQLEnvironment", "usage": "Environment with reset() and step() methods"} |
| ] |
| } |
| } |
|
|