sql_env / specs /F005-VERIFICATION_INPUT.json
hjerpe's picture
Upload folder using huggingface_hub
5dd1bb4 verified
{
"$schema": "autocode-verification-input-v1",
"feature_id": "F005",
"spec_path": "specs/F005-IMPLEMENTATION_SPEC.md",
"generated": "2026-03-27T12:00:00Z",
"verification_mode": "mvp",
"overview": {
"summary": "Automated evaluation wrapper that runs N episodes with a given policy against SQLEnvironment and returns structured metrics (success_rate, avg_reward, avg_steps). Includes a built-in RandomPolicy for instant baseline comparison. Results are collected incrementally so partial failures do not lose completed episode data.",
"goal": "Enable single-command evaluation: 'How does policy X perform over 100 episodes?' with structured output for training comparison (random vs trained)."
},
"interfaces": {
"types": [
{
"name": "Policy",
"description": "Protocol (structural subtype) for any evaluation policy. Any object with a matching select_action method satisfies this interface.",
"fields": [
{"name": "select_action", "type": "(observation: SQLObservation) -> SQLAction", "description": "Choose an action given the current observation"}
]
},
{
"name": "EpisodeResult",
"description": "Per-episode evaluation metrics. Frozen dataclass.",
"fields": [
{"name": "episode_index", "type": "int", "description": "0-based episode number"},
{"name": "correct", "type": "bool", "description": "Whether the ANSWER action matched the gold answer"},
{"name": "total_reward", "type": "float", "description": "Cumulative reward for the episode"},
{"name": "steps", "type": "int", "description": "Number of steps taken in the episode"},
{"name": "error", "type": "str | None", "optional": true, "description": "Error message if episode failed, None otherwise"}
]
},
{
"name": "EvaluationResult",
"description": "Aggregate evaluation metrics with per-episode breakdown. Frozen dataclass.",
"fields": [
{"name": "success_rate", "type": "float", "description": "Fraction of correct episodes in [0.0, 1.0]"},
{"name": "avg_reward", "type": "float", "description": "Mean total_reward across completed episodes"},
{"name": "avg_steps", "type": "float", "description": "Mean steps across completed episodes"},
{"name": "n_episodes", "type": "int", "description": "Total number of episodes attempted"},
{"name": "n_completed", "type": "int", "description": "Episodes that completed without error"},
{"name": "episodes", "type": "list[EpisodeResult]", "description": "Per-episode breakdown for analysis"}
]
}
],
"functions": [
{
"name": "RandomPolicy.__init__",
"params": [
{"name": "seed", "type": "int | None", "default": "None", "description": "Random seed for reproducibility"}
],
"returns": "None",
"description": "Initialize random baseline policy. Deterministic given a seed."
},
{
"name": "RandomPolicy.select_action",
"params": [
{"name": "observation", "type": "SQLObservation", "description": "Current environment observation"}
],
"returns": "SQLAction",
"description": "Pick a random action. If budget_remaining > 1: randomly choose DESCRIBE, SAMPLE, or QUERY. If budget_remaining == 1: ANSWER with a random guess."
},
{
"name": "evaluate",
"params": [
{"name": "env", "type": "SQLEnvironment", "description": "The environment to evaluate against"},
{"name": "policy", "type": "Policy", "description": "Any object satisfying the Policy protocol"},
{"name": "n_episodes", "type": "int", "default": "100", "description": "Number of episodes to run"},
{"name": "seed", "type": "int | None", "default": "None", "description": "Base seed for reproducibility; episode i uses seed+i"},
{"name": "progress_callback", "type": "Callable[[int, int], None] | None", "default": "None", "description": "Optional callback(current, total) for progress reporting"}
],
"returns": "EvaluationResult",
"raises": ["ValueError"],
"description": "Run automated evaluation of a policy over multiple episodes. Collects results incrementally -- failed episodes are recorded and evaluation continues."
}
],
"api_endpoints": []
},
"data_flow": {
"primary_flow": [
"evaluate() called with env, policy, n_episodes, optional seed",
"For each episode: env.reset(seed=base_seed+i) returns initial SQLObservation",
"Loop: policy.select_action(obs) -> SQLAction, then env.step(action) -> SQLObservation, accumulate reward",
"Episode ends when obs.done is True; record EpisodeResult with correct/reward/steps",
"Aggregate all EpisodeResults into EvaluationResult with success_rate, avg_reward, avg_steps"
],
"alternative_flows": [
{
"condition": "n_episodes is 0",
"steps": ["Return EvaluationResult with all zeros and empty episodes list"]
},
{
"condition": "Exception during episode (reset, select_action, or step fails)",
"steps": [
"Catch exception",
"Record EpisodeResult with correct=False, total_reward=0.0, steps=0, error=str(exc)",
"Continue to next episode"
]
}
]
},
"error_handling": {
"error_types": [
{
"name": "ValueError",
"when": "n_episodes < 0",
"handling": "Raise immediately before starting evaluation"
},
{
"name": "Exception (per-episode)",
"when": "Any exception during env.reset(), policy.select_action(), or env.step()",
"handling": "Catch, record as failed EpisodeResult with error field, continue to next episode"
}
],
"retry_strategy": null
},
"dependencies": {
"external": [],
"internal": [
{"name": "models.SQLAction", "usage": "Action type returned by policies"},
{"name": "models.SQLObservation", "usage": "Observation type passed to policies"},
{"name": "server.sql_environment.SQLEnvironment", "usage": "Environment with reset() and step() methods"}
]
}
}