{
"$schema": "autocode-verification-input-v1",
"feature_id": "F004",
"spec_path": "specs/F004-IMPLEMENTATION_SPEC.md",
"generated": "2026-03-24T12:00:00Z",
"verification_mode": "mvp",
"overview": {
"summary": "Expand the question dataset from 53 single-database questions to 100+ curated questions across 10 Spider databases. Each question is enriched with difficulty, answer_type, gold_answer, and tables_involved metadata. The dataset is split into train (roughly 70%) and eval (roughly 30%) partitions, following Spider's own train/validation split. A standalone curation script produces the output JSON files (questions_train.json and questions_eval.json); SQLite database files are downloaded on demand and gitignored.",
"goal": "Enable training on diverse databases and question types to prevent overfitting to one schema, with pre-computed gold answers to improve training throughput."
},
"interfaces": {
"types": [
{
"name": "EnrichedQuestionRecord",
"fields": [
{"name": "question_id", "type": "str", "description": "Unique ID in format {db_id}_{split}_{index:03d}"},
{"name": "question_text", "type": "str", "description": "Natural language question"},
{"name": "database_name", "type": "str", "description": "Spider db_id; matches the directory name under data/databases/"},
{"name": "gold_sql", "type": "str", "description": "Reference SQL query"},
{"name": "gold_answer", "type": "Any", "description": "Pre-computed result of executing gold_sql"},
{"name": "answer_type", "type": "str", "description": "One of: integer, float, string, list, table"},
{"name": "difficulty", "type": "str", "description": "One of: easy, medium, hard"},
{"name": "tables_involved", "type": "list[str]", "description": "Table names referenced in gold_sql"},
{"name": "split", "type": "str", "description": "One of: train, eval"}
],
"description": "A single enriched question record in the output JSON files. Field names match the QuestionRecord conceptual design in models.py."
}
],
"functions": [
{
"name": "download_spider_databases",
"params": [
{"name": "db_ids", "type": "list[str]", "description": "List of Spider database identifiers"},
{"name": "output_dir", "type": "Path", "description": "Base directory for database files"}
],
"returns": "dict[str, Path]",
"raises": ["FileNotFoundError"],
"description": "Download the Spider SQLite database files for the specified db_ids, skipping files that already exist."
},
{
"name": "load_spider_questions",
"params": [
{"name": "db_ids", "type": "list[str]", "description": "List of Spider database identifiers"}
],
"returns": "list[dict]",
"raises": ["ConnectionError"],
"description": "Load raw Spider questions from HuggingFace for the specified databases, covering both the train and validation splits."
},
{
"name": "compute_gold_answer",
"params": [
{"name": "gold_sql", "type": "str", "description": "Reference SQL query"},
{"name": "db_path", "type": "Path", "description": "Path to SQLite database file"}
],
"returns": "Any",
"raises": ["sqlite3.Error"],
"description": "Execute gold SQL against SQLite database and return the result."
},
{
"name": "classify_answer_type",
"params": [
{"name": "gold_answer", "type": "Any", "description": "Pre-computed answer value"}
],
"returns": "str",
"description": "Classify answer as integer, float, string, list, or table based on shape and type."
},
{
"name": "extract_tables_involved",
"params": [
{"name": "gold_sql", "type": "str", "description": "Reference SQL query"}
],
"returns": "list[str]",
"description": "Extract the sorted, unique table names from a SQL query using regex parsing."
},
{
"name": "classify_difficulty",
"params": [
{"name": "tables_involved", "type": "list[str]", "description": "Tables referenced in query"}
],
"returns": "str",
"description": "Assign difficulty (easy/medium/hard) based on table count: 1-2=easy, 3=medium, 4+=hard."
},
{
"name": "assign_splits",
"params": [
{"name": "questions", "type": "list[dict]", "description": "Enriched questions with spider_split key"}
],
"returns": "list[dict]",
"description": "Assign train/eval splits based on Spider's own train/validation split."
},
{
"name": "validate_dataset",
"params": [
{"name": "questions", "type": "list[dict]", "description": "Full enriched dataset"},
{"name": "db_paths", "type": "dict[str, Path]", "description": "Mapping of db_id to SQLite path"}
],
"returns": "list[str]",
"raises": ["sqlite3.Error"],
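"description": "Validate the dataset: all required fields are present, gold_sql executes, gold_answer matches the result of executing gold_sql, there are no duplicate question IDs, train/eval splits are clean, and the difficulty distribution is roughly 40/40/20 (easy/medium/hard)."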
}
],
"api_endpoints": []
},
"data_flow": {
"primary_flow": [
"Read db_list.json for target database IDs",
"Download Spider SQLite databases to data/databases/{db_id}/{db_id}.sqlite",
"Load raw Spider questions from HuggingFace for target db_ids (train + validation splits)",
"For each question: execute gold_sql against SQLite to compute gold_answer",
"Classify answer_type from gold_answer shape and type",
"Extract tables_involved from gold_sql via regex",
"Classify difficulty from tables_involved count",
"Assign train/eval split from Spider's own split",
"Generate question_id in format {db_id}_{split}_{index:03d}",
"Validate full dataset (fields, execution, deduplication, distribution)",
"Write questions_train.json and questions_eval.json"
],
"alternative_flows": [
{
"name": "Gold SQL execution failure",
"trigger": "gold_sql raises sqlite3.Error against its database",
"steps": [
"Log warning with db_id and error",
"Skip the question (exclude from dataset)",
"Continue processing remaining questions"
]
},
{
"name": "Validate-only mode",
"trigger": "Script invoked with --validate flag",
"steps": [
"Load existing questions_train.json and questions_eval.json",
"Locate SQLite databases in data/databases/",
"Run validate_dataset() on loaded data",
"Print validation results and exit with status code 0 (valid) or 1 (invalid)"
]
}
]
},
"error_handling": {
"error_types": [
{
"name": "FileNotFoundError",
"when": "SQLite database file cannot be downloaded for a given db_id",
"message_template": "Failed to download database: {db_id}"
},
{
"name": "sqlite3.OperationalError",
"when": "Gold SQL uses an unsupported SQLite feature",
"message_template": "SQL execution failed for {db_id}: {error}"
},
{
"name": "ConnectionError",
"when": "HuggingFace dataset download fails",
"message_template": "Failed to download Spider dataset: {error}"
},
{
"name": "ValidationError",
"when": "Dataset fails one or more validation checks",
"message_template": "Validation failed with {count} errors"
}
],
"retry_strategy": {
"enabled": true,
"max_attempts": 2,
"backoff": "linear"
}
},
"dependencies": {
"external": [
"datasets (HuggingFace)",
"sqlite3 (stdlib)"
],
"internal": [
"models.py (QuestionRecord conceptual design for field names)",
"data/questions/db_list.json (database configuration)"
]
}
}