sql_env / specs /F002-VERIFICATION_INPUT.json
hjerpe's picture
Upload folder using huggingface_hub
5dd1bb4 verified
{
"$schema": "autocode-verification-input-v1",
"feature_id": "F002",
"spec_path": "specs/F002-IMPLEMENTATION_SPEC.md",
"generated": "2026-03-27T12:00:00Z",
"verification_mode": "mvp",
"overview": {
"summary": "Type-aware answer verification for SQLEnv that replaces naive string comparison with dispatched comparers for integer (exact), float (1% tolerance), string (case-insensitive), and list (order-insensitive) answer types. Falls back to string comparison when answer_type is missing.",
"goal": "Ensure correct agent answers are not rejected due to trivial formatting, type coercion, or ordering differences."
},
"interfaces": {
"types": [
{
"name": "EpisodeContext",
"fields": [
{"name": "gold_rows", "type": "list[tuple] | None", "optional": true, "description": "Raw SQL result rows for accurate list comparison by verifier"}
],
"description": "Per-episode server-side state. Modified to add gold_rows field alongside existing gold_answer."
}
],
"functions": [
{
"name": "verify_answer",
"params": [
{"name": "predicted", "type": "str", "description": "Agent's submitted answer string"},
{"name": "gold", "type": "str", "description": "Gold answer as formatted string"},
{"name": "answer_type", "type": "str | None", "default": "None", "description": "One of 'integer', 'float', 'string', 'list', or None"},
{"name": "gold_rows", "type": "list[tuple] | None", "default": "None", "description": "Raw SQL result rows for list comparison"}
],
"returns": "bool",
"raises": [],
"description": "Compare agent answer against gold answer using type-specific comparison. Dispatches by answer_type; falls back to string comparison for None/unknown types."
},
{
"name": "_compare_integer",
"params": [
{"name": "predicted", "type": "str", "description": "Agent value"},
{"name": "gold", "type": "str", "description": "Gold value"}
],
"returns": "bool",
"description": "Exact integer match after coercing both sides via int(float(x)). Returns False on ValueError."
},
{
"name": "_compare_float",
"params": [
{"name": "predicted", "type": "str", "description": "Agent value"},
{"name": "gold", "type": "str", "description": "Gold value"},
{"name": "tolerance", "type": "float", "default": "0.01", "description": "Relative tolerance (1% default)"}
],
"returns": "bool",
"description": "Float comparison with relative tolerance. Uses abs(pred - gold) <= tolerance * abs(gold). For gold==0, uses absolute tolerance 1e-9."
},
{
"name": "_compare_string",
"params": [
{"name": "predicted", "type": "str", "description": "Agent value"},
{"name": "gold", "type": "str", "description": "Gold value"}
],
"returns": "bool",
"description": "Case-insensitive, whitespace-normalized string comparison."
},
{
"name": "_compare_list",
"params": [
{"name": "predicted", "type": "str", "description": "Agent value"},
{"name": "gold", "type": "str", "description": "Gold value as formatted string"},
{"name": "gold_rows", "type": "list[tuple] | None", "default": "None", "description": "Raw rows for accurate comparison"}
],
"returns": "bool",
"description": "Order-insensitive set comparison. Parses both sides into normalized string sets and compares equality."
}
],
"api_endpoints": []
},
"data_flow": {
"primary_flow": [
"Agent sends ANSWER action with value string",
"step() dispatches to _handle_answer(value)",
"_handle_answer() calls verify_answer(predicted, gold, answer_type, gold_rows)",
"verify_answer() dispatches to type-specific comparer based on answer_type",
"Comparer returns bool; _handle_answer returns (bool, float reward)"
],
"alternative_flows": [
{
"name": "Unknown or missing answer_type",
"trigger": "answer_type is None or not in known set",
"steps": [
"verify_answer receives answer_type=None",
"Falls back to _compare_string(predicted, gold)",
"Returns bool"
]
},
{
"name": "Type coercion failure",
"trigger": "predicted cannot be parsed as int or float",
"steps": [
"_compare_integer or _compare_float catches ValueError",
"Returns False (answer treated as incorrect)"
]
},
{
"name": "Empty or None input",
"trigger": "predicted is empty string after strip",
"steps": [
"verify_answer returns False immediately"
]
}
]
},
"error_handling": {
"error_types": [
{
"name": "ValueError",
"when": "Predicted value cannot be coerced to int/float during comparison"
},
{
"name": "RuntimeError",
"when": "_handle_answer called with no active episode (existing behavior, unchanged)"
}
],
"retry_strategy": null
},
"dependencies": {
"external": [],
"internal": [
{"name": "models.EpisodeContext", "usage": "gold_rows field added for verifier input"},
{"name": "models.QuestionRecord", "usage": "answer_type field read to determine comparison strategy"},
{"name": "server.sql_environment._handle_answer", "usage": "Modified to call verify_answer instead of inline comparison"}
]
}
}