| { |
| "$schema": "autocode-verification-input-v1", |
| "feature_id": "F004", |
| "spec_path": "specs/F004-IMPLEMENTATION_SPEC.md", |
| "generated": "2026-03-24T12:00:00Z", |
| "verification_mode": "mvp", |
|
|
| "overview": { |
| "summary": "Expand the question dataset from 53 single-database questions to 100+ curated questions across 10 Spider databases. Each question is enriched with difficulty, answer_type, gold_answer, and tables_involved metadata. The dataset is split into train and eval partitions following Spider's own train/validation split, targeting roughly 70% train and 30% eval. A standalone curation script produces the output JSON files; SQLite database files are downloaded on-demand and gitignored.", |
| "goal": "Enable training on diverse databases and question types to prevent overfitting to one schema, with pre-computed gold answers to improve training throughput." |
| }, |
|
|
| "interfaces": { |
| "types": [ |
| { |
| "name": "EnrichedQuestionRecord", |
| "fields": [ |
| {"name": "question_id", "type": "str", "description": "Unique ID in format {db_id}_{split}_{index:03d}"}, |
| {"name": "question_text", "type": "str", "description": "Natural language question"}, |
| {"name": "database_name", "type": "str", "description": "Spider db_id matching directory in data/databases/"}, |
| {"name": "gold_sql", "type": "str", "description": "Reference SQL query"}, |
| {"name": "gold_answer", "type": "Any", "description": "Pre-computed result of executing gold_sql"}, |
| {"name": "answer_type", "type": "str", "description": "One of: integer, float, string, list, table"}, |
| {"name": "difficulty", "type": "str", "description": "One of: easy, medium, hard"}, |
| {"name": "tables_involved", "type": "list[str]", "description": "Table names referenced in gold_sql"}, |
| {"name": "split", "type": "str", "description": "One of: train, eval"} |
| ], |
| "description": "A single enriched question record in the output JSON files. Field names match the QuestionRecord conceptual design in models.py." |
| } |
| ], |
| "functions": [ |
| { |
| "name": "download_spider_databases", |
| "params": [ |
| {"name": "db_ids", "type": "list[str]", "description": "List of Spider database identifiers"}, |
| {"name": "output_dir", "type": "Path", "description": "Base directory for database files"} |
| ], |
| "returns": "dict[str, Path]", |
| "raises": ["FileNotFoundError"], |
| "description": "Download Spider SQLite database files for specified db_ids. Skips existing files." |
| }, |
| { |
| "name": "load_spider_questions", |
| "params": [ |
| {"name": "db_ids", "type": "list[str]", "description": "List of Spider database identifiers"} |
| ], |
| "returns": "list[dict]", |
| "raises": ["ConnectionError"], |
| "description": "Load raw Spider questions from HuggingFace for the specified databases, drawing from both the train and validation splits." |
| }, |
| { |
| "name": "compute_gold_answer", |
| "params": [ |
| {"name": "gold_sql", "type": "str", "description": "Reference SQL query"}, |
| {"name": "db_path", "type": "Path", "description": "Path to SQLite database file"} |
| ], |
| "returns": "Any", |
| "raises": ["sqlite3.Error"], |
| "description": "Execute gold SQL against SQLite database and return the result." |
| }, |
| { |
| "name": "classify_answer_type", |
| "params": [ |
| {"name": "gold_answer", "type": "Any", "description": "Pre-computed answer value"} |
| ], |
| "returns": "str", |
| "description": "Classify answer as integer, float, string, list, or table based on shape and type." |
| }, |
| { |
| "name": "extract_tables_involved", |
| "params": [ |
| {"name": "gold_sql", "type": "str", "description": "Reference SQL query"} |
| ], |
| "returns": "list[str]", |
| "description": "Extract sorted unique table names from SQL query using regex parsing." |
| }, |
| { |
| "name": "classify_difficulty", |
| "params": [ |
| {"name": "tables_involved", "type": "list[str]", "description": "Tables referenced in query"} |
| ], |
| "returns": "str", |
| "description": "Assign difficulty (easy/medium/hard) based on table count: 1-2=easy, 3=medium, 4+=hard." |
| }, |
| { |
| "name": "assign_splits", |
| "params": [ |
| {"name": "questions", "type": "list[dict]", "description": "Enriched questions with spider_split key"} |
| ], |
| "returns": "list[dict]", |
| "description": "Assign train/eval splits based on Spider's own train/validation split." |
| }, |
| { |
| "name": "validate_dataset", |
| "params": [ |
| {"name": "questions", "type": "list[dict]", "description": "Full enriched dataset"}, |
| {"name": "db_paths", "type": "dict[str, Path]", "description": "Mapping of db_id to SQLite path"} |
| ], |
| "returns": "list[str]", |
| "raises": ["sqlite3.Error"], |
| "description": "Validate dataset: all fields present, gold_sql executes, gold_answer matches, no duplicate IDs, clean splits, difficulty distribution ~40/40/20." |
| } |
| ], |
| "api_endpoints": [] |
| }, |
|
|
| "data_flow": { |
| "primary_flow": [ |
| "Read db_list.json for target database IDs", |
| "Download Spider SQLite databases to data/databases/{db_id}/{db_id}.sqlite", |
| "Load raw Spider questions from HuggingFace for target db_ids (train + validation splits)", |
| "For each question: execute gold_sql against SQLite to compute gold_answer", |
| "Classify answer_type from gold_answer shape and type", |
| "Extract tables_involved from gold_sql via regex", |
| "Classify difficulty from tables_involved count", |
| "Assign train/eval split from Spider's own split", |
| "Generate question_id in format {db_id}_{split}_{index:03d}", |
| "Validate full dataset (fields, execution, deduplication, distribution)", |
| "Write questions_train.json and questions_eval.json" |
| ], |
| "alternative_flows": [ |
| { |
| "name": "Gold SQL execution failure", |
| "trigger": "gold_sql raises sqlite3.Error against its database", |
| "steps": [ |
| "Log warning with db_id and error", |
| "Skip the question (exclude from dataset)", |
| "Continue processing remaining questions" |
| ] |
| }, |
| { |
| "name": "Validate-only mode", |
| "trigger": "Script invoked with --validate flag", |
| "steps": [ |
| "Load existing questions_train.json and questions_eval.json", |
| "Locate SQLite databases in data/databases/", |
| "Run validate_dataset() on loaded data", |
| "Print validation results and exit with status code 0 (valid) or 1 (invalid)" |
| ] |
| } |
| ] |
| }, |
|
|
| "error_handling": { |
| "error_types": [ |
| { |
| "name": "FileNotFoundError", |
| "when": "SQLite database file cannot be downloaded for a given db_id", |
| "message_template": "Failed to download database: {db_id}" |
| }, |
| { |
| "name": "sqlite3.OperationalError", |
| "when": "Gold SQL uses syntax or a feature that SQLite does not support", |
| "message_template": "SQL execution failed for {db_id}: {error}" |
| }, |
| { |
| "name": "ConnectionError", |
| "when": "HuggingFace dataset download fails", |
| "message_template": "Failed to download Spider dataset: {error}" |
| }, |
| { |
| "name": "ValidationError", |
| "when": "Dataset fails one or more validation checks", |
| "message_template": "Validation failed with {count} errors" |
| } |
| ], |
| "retry_strategy": { |
| "enabled": true, |
| "max_attempts": 2, |
| "backoff": "linear" |
| } |
| }, |
|
|
| "dependencies": { |
| "external": [ |
| "datasets (HuggingFace)", |
| "sqlite3 (stdlib)" |
| ], |
| "internal": [ |
| "models.py (QuestionRecord conceptual design for field names)", |
| "data/questions/db_list.json (database configuration)" |
| ] |
| } |
| } |
|
|