Spaces:

hjerpe
/

sql_env

Running

App Files Files Community

sql_env / specs /F004-VERIFICATION_INPUT.json

hjerpe

Upload folder using huggingface_hub

5dd1bb4 verified 22 days ago

raw

history blame contribute delete

7.89 kB

	{
	"$schema": "autocode-verification-input-v1",
	"feature_id": "F004",
	"spec_path": "specs/F004-IMPLEMENTATION_SPEC.md",
	"generated": "2026-03-24T12:00:00Z",
	"verification_mode": "mvp",

	"overview": {
	"summary": "Expand the question dataset from 53 single-database questions to 100+ curated questions across 10 Spider databases. Each question is enriched with difficulty, answer_type, gold_answer, and tables_involved metadata. The dataset is split into train and eval partitions (targeting roughly 70%/30%) that follow Spider's own train/validation split. A standalone curation script produces the output JSON files; SQLite database files are downloaded on demand and gitignored.",
	"goal": "Enable training on diverse databases and question types to prevent overfitting to one schema, with pre-computed gold answers to improve training throughput."
	},

	"interfaces": {
	"types": [
	{
	"name": "EnrichedQuestionRecord",
	"fields": [
	{"name": "question_id", "type": "str", "description": "Unique ID in format {db_id}_{split}_{index:03d}"},
	{"name": "question_text", "type": "str", "description": "Natural language question"},
	{"name": "database_name", "type": "str", "description": "Spider db_id matching directory in data/databases/"},
	{"name": "gold_sql", "type": "str", "description": "Reference SQL query"},
	{"name": "gold_answer", "type": "Any", "description": "Pre-computed result of executing gold_sql"},
	{"name": "answer_type", "type": "str", "description": "One of: integer, float, string, list, table"},
	{"name": "difficulty", "type": "str", "description": "One of: easy, medium, hard"},
	{"name": "tables_involved", "type": "list[str]", "description": "Table names referenced in gold_sql"},
	{"name": "split", "type": "str", "description": "One of: train, eval"}
	],
	"description": "A single enriched question record in the output JSON files. Field names match the QuestionRecord conceptual design in models.py."
	}
	],
	"functions": [
	{
	"name": "download_spider_databases",
	"params": [
	{"name": "db_ids", "type": "list[str]", "description": "List of Spider database identifiers"},
	{"name": "output_dir", "type": "Path", "description": "Base directory for database files"}
	],
	"returns": "dict[str, Path]",
	"raises": ["FileNotFoundError"],
	"description": "Download Spider SQLite database files for specified db_ids. Skips existing files."
	},
	{
	"name": "load_spider_questions",
	"params": [
	{"name": "db_ids", "type": "list[str]", "description": "List of Spider database identifiers"}
	],
	"returns": "list[dict]",
	"raises": ["ConnectionError"],
	"description": "Load raw Spider questions from HuggingFace for the specified databases, drawing from both the train and validation splits."
	},
	{
	"name": "compute_gold_answer",
	"params": [
	{"name": "gold_sql", "type": "str", "description": "Reference SQL query"},
	{"name": "db_path", "type": "Path", "description": "Path to SQLite database file"}
	],
	"returns": "Any",
	"raises": ["sqlite3.Error"],
	"description": "Execute gold SQL against SQLite database and return the result."
	},
	{
	"name": "classify_answer_type",
	"params": [
	{"name": "gold_answer", "type": "Any", "description": "Pre-computed answer value"}
	],
	"returns": "str",
	"description": "Classify answer as integer, float, string, list, or table based on shape and type."
	},
	{
	"name": "extract_tables_involved",
	"params": [
	{"name": "gold_sql", "type": "str", "description": "Reference SQL query"}
	],
	"returns": "list[str]",
	"description": "Extract sorted unique table names from SQL query using regex parsing."
	},
	{
	"name": "classify_difficulty",
	"params": [
	{"name": "tables_involved", "type": "list[str]", "description": "Tables referenced in query"}
	],
	"returns": "str",
	"description": "Assign difficulty (easy/medium/hard) based on table count: 1-2=easy, 3=medium, 4+=hard."
	},
	{
	"name": "assign_splits",
	"params": [
	{"name": "questions", "type": "list[dict]", "description": "Enriched questions with spider_split key"}
	],
	"returns": "list[dict]",
	"description": "Assign train/eval splits based on Spider's own train/validation split."
	},
	{
	"name": "validate_dataset",
	"params": [
	{"name": "questions", "type": "list[dict]", "description": "Full enriched dataset"},
	{"name": "db_paths", "type": "dict[str, Path]", "description": "Mapping of db_id to SQLite path"}
	],
	"returns": "list[str]",
	"raises": ["sqlite3.Error"],
	"description": "Validate the dataset: all fields are present, gold_sql executes, gold_answer matches the result of executing gold_sql, there are no duplicate question IDs, the train/eval splits are clean, and the difficulty distribution is approximately 40/40/20 (easy/medium/hard)."
	}
	],
	"api_endpoints": []
	},

	"data_flow": {
	"primary_flow": [
	"Read db_list.json for target database IDs",
	"Download Spider SQLite databases to data/databases/{db_id}/{db_id}.sqlite",
	"Load raw Spider questions from HuggingFace for target db_ids (train + validation splits)",
	"For each question: execute gold_sql against SQLite to compute gold_answer",
	"Classify answer_type from gold_answer shape and type",
	"Extract tables_involved from gold_sql via regex",
	"Classify difficulty from tables_involved count",
	"Assign train/eval split from Spider's own split",
	"Generate question_id in format {db_id}_{split}_{index:03d}",
	"Validate full dataset (fields, execution, deduplication, distribution)",
	"Write questions_train.json and questions_eval.json"
	],
	"alternative_flows": [
	{
	"name": "Gold SQL execution failure",
	"trigger": "gold_sql raises sqlite3.Error against its database",
	"steps": [
	"Log warning with db_id and error",
	"Skip the question (exclude from dataset)",
	"Continue processing remaining questions"
	]
	},
	{
	"name": "Validate-only mode",
	"trigger": "Script invoked with --validate flag",
	"steps": [
	"Load existing questions_train.json and questions_eval.json",
	"Locate SQLite databases in data/databases/",
	"Run validate_dataset() on loaded data",
	"Print validation results and exit with 0 (valid) or 1 (invalid)"
	]
	}
	]
	},

	"error_handling": {
	"error_types": [
	{
	"name": "FileNotFoundError",
	"when": "SQLite database file cannot be downloaded for a given db_id",
	"message_template": "Failed to download database: {db_id}"
	},
	{
	"name": "sqlite3.OperationalError",
	"when": "Gold SQL uses an unsupported SQLite feature",
	"message_template": "SQL execution failed for {db_id}: {error}"
	},
	{
	"name": "ConnectionError",
	"when": "HuggingFace dataset download fails",
	"message_template": "Failed to download Spider dataset: {error}"
	},
	{
	"name": "ValidationError",
	"when": "Dataset fails one or more validation checks",
	"message_template": "Validation failed with {count} errors"
	}
	],
	"retry_strategy": {
	"enabled": true,
	"max_attempts": 2,
	"backoff": "linear"
	}
	},

	"dependencies": {
	"external": [
	"datasets (HuggingFace)",
	"sqlite3 (stdlib)"
	],
	"internal": [
	"models.py (QuestionRecord conceptual design for field names)",
	"data/questions/db_list.json (database configuration)"
	]
	}
	}