Spaces:

hjerpe
/

sql_env

Sleeping

App Files Files Community

sql_env / specs /FEATURES.json

hjerpe

Upload folder using huggingface_hub

9e64e71 verified 13 days ago

raw

history blame contribute delete

60.2 kB

	{
	"$schema": "./schemas/autocode-features-v1.schema.json",
	"project": "SQLEnv - Interactive Database Query RL Environment",
	"description": "OpenEnv Challenge submission: RL environment where agents learn to answer NL questions about databases through iterative SQL exploration",
	"created": "2026-03-24T07:15:50Z",
	"updated": "2026-04-11T15:55:16Z",
	"features": [
	{
	"id": "F001",
	"name": "Core Environment Loop",
	"description": "Complete the step/reset lifecycle: remove Ollama from environment, accept structured actions (DESCRIBE table_name, SAMPLE table_name, QUERY sql_string, ANSWER value), wire up SQLite execution with sandboxing (read-only, 5s timeout, SELECT-only), load questions from JSON on reset(), enforce step budget (15 steps), handle episode termination",
	"complexity": "complex",
	"verification_mode": "standard",
	"status": "complete",
	"priority": 1,
	"dependencies": [],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "user_interview",
	"notes": "Derived from docs_draft/sql_env_project_brief.md and docs_draft/SQLEnv_Concept_v1.md — the v1 spec defines the action space, episode lifecycle, and sandboxing requirements"
	},
	"user_interview": {
	"conducted": "2026-03-24T09:00:00Z",
	"skipped": false,
	"skip_reason": null,
	"value": {
	"question": "What will users be able to do that they couldn't before?",
	"response": "Agents can play complete episodes: reset with a random question, explore a hidden schema via DESCRIBE/SAMPLE, run SQL queries, and submit answers. Currently SQL never executes — this makes the environment actually functional."
	},
	"experience": {
	"question": "Walk me through using this. What would delight you? What would frustrate you?",
	"delights": [
	"Agent sends DESCRIBE employees and immediately sees column names and types",
	"Queries execute in <100ms with clean truncated output (max 20 rows)",
	"Bad SQL returns a clear error message the agent can learn from",
	"Episode ends cleanly when budget exhausted or ANSWER submitted"
	],
	"frustrations": [
	"Environment calling Ollama to interpret actions (current design) — agent should own reasoning, env should just execute",
	"Queries hanging or crashing the environment",
	"Opaque error messages that don't help the agent adjust"
	]
	},
	"maturity": {
	"question": "Is this exploratory, MVP, or production?",
	"response": "mvp",
	"rationale": "Competition submission — needs to work reliably for demo and training, not at production scale"
	}
	},
	"progress": {
	"implementation_steps": {
	"total": 8,
	"completed": 8
	},
	"verification_tests": {
	"total": 86,
	"passed": 25
	}
	},
	"specs": {
	"implementation": "specs/F001-IMPLEMENTATION_SPEC.md",
	"verification": "specs/F001-VERIFICATION_SPEC.md"
	},
	"timestamps": {
	"planned": "2026-03-24T10:30:00Z",
	"verification_planned": "2026-03-24T10:30:00Z",
	"started": "2026-03-24T19:22:08Z",
	"completed": "2026-03-24T21:27:31Z"
	},
	"verification_evidence": {
	"mode": "standard",
	"tests_run": 25,
	"tests_passed": 25,
	"timestamp": "2026-03-24T21:27:31Z",
	"command": "uv run pytest tests/ -v",
	"verifier_result": "approved"
	},
	"demo": {
	"path": "specs/F001-DEMO.md",
	"generated_at": "2026-03-24T21:36:32Z",
	"mode": "local_cli",
	"status": "partial",
	"requires_user_verification": true,
	"verification_surfaces": [
	"local_server_startup",
	"data_provisioning",
	"api_episode_flow"
	],
	"evidence_refs": [
	"specs/F001-VERIFICATION_SPEC.md",
	"specs/F001-DEMO.md"
	],
	"note": "Local server and tests verified; end-to-end API episode flow requires local Spider DB provisioning."
	},
	"user_value": "Agents can now run complete SQL exploration episodes end-to-end with structured DESCRIBE/SAMPLE/QUERY/ANSWER actions, live read-only SQLite execution, clear error feedback, and clean terminal completion on ANSWER or budget exhaustion."
	},
	{
	"id": "F002",
	"name": "Answer Verification",
	"description": "Multi-type answer comparison: integer (exact match), float (1% tolerance), string (case-insensitive normalized), list (order-insensitive set comparison). Implements verify_answer() in server/verifier.py. Returns binary correctness for terminal reward.",
	"complexity": "standard",
	"verification_mode": "standard",
	"status": "complete",
	"priority": 2,
	"dependencies": [
	"F001"
	],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "user_interview",
	"notes": "Answer type handling defined in docs_draft/SQLEnv_Concept_v1.md Section 4.2"
	},
	"user_interview": {
	"conducted": "2026-03-24T09:00:00Z",
	"skipped": false,
	"skip_reason": null,
	"value": {
	"question": "What will users be able to do that they couldn't before?",
	"response": "When an agent submits ANSWER, the environment correctly determines if the answer matches the gold answer regardless of type (42 vs 42.0, 'Engineering' vs 'engineering', unordered lists)."
	},
	"experience": {
	"question": "Walk me through using this. What would delight you? What would frustrate you?",
	"delights": [
	"Float comparison with tolerance handles rounding gracefully (95000.1 matches 95000)",
	"List comparison ignores order: ['A','B'] matches ['B','A']",
	"Clear pass/fail with no ambiguity"
	],
	"frustrations": [
	"Correct answer rejected due to trivial formatting difference",
	"Type coercion failures (agent says '42', gold is integer 42)"
	]
	},
	"maturity": {
	"question": "Is this exploratory, MVP, or production?",
	"response": "mvp",
	"rationale": "Must handle the 4 core answer types reliably. Table comparison can come later."
	}
	},
	"progress": {
	"implementation_steps": {
	"total": 4,
	"completed": 4
	},
	"verification_tests": {
	"total": 65,
	"passed": 65
	}
	},
	"specs": {
	"implementation": "specs/F002-IMPLEMENTATION_SPEC.md",
	"verification": "specs/F002-VERIFICATION_SPEC.md"
	},
	"timestamps": {
	"planned": "2026-03-27T12:00:00Z",
	"verification_planned": "2026-03-27T12:00:00Z",
	"started": "2026-03-27T22:18:15Z",
	"completed": "2026-03-27T22:33:12Z"
	},
	"verification_evidence": {
	"mode": "standard",
	"tests_run": 65,
	"tests_passed": 65,
	"timestamp": "2026-03-27T22:33:12Z",
	"command": "uv run pytest tests/ -v",
	"verifier_result": "approved"
	},
	"demo": {
	"path": "specs/F002-DEMO.md",
	"generated_at": "2026-03-27T22:37:50Z",
	"mode": "artifact_build",
	"status": "partial",
	"requires_user_verification": true,
	"verification_surfaces": [
	"local_pytest_verification",
	"runtime_episode_scoring"
	],
	"evidence_refs": [
	"specs/F002-VERIFICATION_SPEC.md",
	"specs/F002-DEMO.md"
	],
	"note": "Strongest local proof is targeted and integration pytest evidence; final runtime confirmation remains a user-operated episode check."
	},
	"user_value": "Agents can now submit ANSWER values across integer, float, string, and list questions and receive correct terminal scoring despite formatting differences, numeric representation differences, and list order changes."
	},
	{
	"id": "F003",
	"name": "Dense Reward System",
	"description": "3-layer reward architecture: Layer 1 (operational validity: exec_ok +0.02, new_info +0.01 capped at 0.10, repeat -0.01, step_cost -0.005), Layer 2 (progress-to-target: weighted average of cardinality matching + value overlap + numeric range proximity, binned to 5 levels, improvement-only), Layer 3 (terminal correctness: +1.0 or 0.0). Total step rewards capped at 0.5, negative floor at -0.2.",
	"complexity": "complex",
	"verification_mode": "standard",
	"status": "complete",
	"priority": 3,
	"dependencies": [
	"F001",
	"F002"
	],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "user_interview",
	"notes": "Reward architecture defined in docs_draft/SQLEnv_Concept_v1.md Section 3 and docs_draft/reward-research_gpt-5-2.md. Distance metrics detailed in docs_draft/reward_design.md."
	},
	"user_interview": {
	"conducted": "2026-03-24T09:00:00Z",
	"skipped": false,
	"skip_reason": null,
	"value": {
	"question": "What will users be able to do that they couldn't before?",
	"response": "Agents get meaningful feedback during exploration — not just 0/1 at the end. A query that returns 40 when the answer is 42 gets partial credit. Discovering new schema info gets a small reward. This makes GRPO training converge."
	},
	"experience": {
	"question": "Walk me through using this. What would delight you? What would frustrate you?",
	"delights": [
	"Reward varies meaningfully: random exploration ~0.1, targeted queries ~0.3, correct answer ~1.3",
	"Anti-gaming works: agent can't farm rewards by describing everything or repeating queries",
	"Progress signal is coarsened to prevent reward hill-climbing"
	],
	"frustrations": [
	"Reward hacking: agent learns to exploit shaping rather than solve the task",
	"Reward too sparse: agent gets no signal until terminal step",
	"Over-complex reward that's hard to debug"
	]
	},
	"maturity": {
	"question": "Is this exploratory, MVP, or production?",
	"response": "mvp",
	"rationale": "Start with weighted average of 3 metrics (cardinality, value overlap, numeric range). Add complexity only if training shows issues."
	}
	},
	"progress": {
	"implementation_steps": {
	"total": 7,
	"completed": 7
	},
	"verification_tests": {
	"total": 61,
	"passed": 166
	}
	},
	"specs": {
	"implementation": "specs/F003-IMPLEMENTATION_SPEC.md",
	"verification": "specs/F003-VERIFICATION_SPEC.md"
	},
	"timestamps": {
	"planned": "2026-03-27T12:00:00Z",
	"verification_planned": "2026-03-27T12:00:00Z",
	"started": "2026-03-27T23:51:47Z",
	"completed": "2026-03-28T06:05:02Z"
	},
	"verification_evidence": {
	"mode": "standard",
	"tests_run": 166,
	"tests_passed": 166,
	"timestamp": "2026-03-28T06:05:02Z",
	"command": "uv run --with pytest pytest tests/ -v",
	"verifier_result": "approved"
	},
	"demo": {
	"path": "specs/F003-DEMO.md",
	"generated_at": "2026-03-28T06:07:34Z",
	"mode": "artifact_build",
	"status": "generated",
	"requires_user_verification": true,
	"verification_surfaces": [
	"local_pytest_verification",
	"runtime_episode_flow"
	],
	"evidence_refs": [
	"specs/F003-VERIFICATION_SPEC.md",
	"specs/F003-DEMO.md"
	],
	"note": "Strongest local proof is targeted smoke/unit execution; full reward calibration and live episode behavior should be confirmed in a user-run episode/training context."
	},
	"user_value": "Agents now receive dense numeric rewards on every non-terminal DESCRIBE/SAMPLE/QUERY step based on execution quality and progress toward the gold answer, while terminal correctness still dominates total episode reward."
	},
	{
	"id": "F004",
	"name": "Question Dataset Expansion",
	"description": "Expand from 53 questions (one DB) to 100+ questions across 5-10 Spider databases. Add difficulty labels (easy/medium/hard at 40/40/20 split), answer_type metadata, and gold_answer fields. Create train/eval split (70/30). Curate for diversity of answer types and SQL patterns.",
	"complexity": "standard",
	"verification_mode": "mvp",
	"status": "complete",
	"priority": 4,
	"dependencies": [],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "user_interview",
	"notes": "Dataset requirements from docs_draft/sql_env_project_brief.md Section 3 and SQLEnv_Concept_v1.md Section 4"
	},
	"user_interview": {
	"conducted": "2026-03-24T09:00:00Z",
	"skipped": false,
	"skip_reason": null,
	"value": {
	"question": "What will users be able to do that they couldn't before?",
	"response": "Training on diverse databases and question types. Current single-DB setup risks overfitting to one schema."
	},
	"experience": {
	"question": "Walk me through using this. What would delight you? What would frustrate you?",
	"delights": [
	"Clear difficulty progression: easy questions have 1-2 tables, hard ones have 5+",
	"Each question has pre-computed gold_answer so reward doesn't need to re-execute gold SQL every episode",
	"Train/eval split prevents training on evaluation data"
	],
	"frustrations": [
	"Questions that require SQL features SQLite doesn't support",
	"Ambiguous gold answers (multiple valid interpretations)",
	"All questions from same domain = no generalization"
	]
	},
	"maturity": {
	"question": "Is this exploratory, MVP, or production?",
	"response": "mvp",
	"rationale": "100 well-curated questions is sufficient for competition demo. Quality over quantity."
	}
	},
	"progress": {
	"implementation_steps": {
	"total": 6,
	"completed": 6
	},
	"verification_tests": {
	"total": 66,
	"passed": 21
	}
	},
	"specs": {
	"implementation": "specs/F004-IMPLEMENTATION_SPEC.md",
	"verification": "specs/F004-VERIFICATION_SPEC.md"
	},
	"demo": {
	"path": "specs/F004-DEMO.md",
	"generated_at": "2026-03-24T21:07:31Z"
	},
	"timestamps": {
	"planned": "2026-03-24T10:30:00Z",
	"verification_planned": "2026-03-24T10:30:00Z",
	"started": "2026-03-24T16:53:35Z",
	"completed": "2026-03-24T21:04:54Z"
	},
	"verification_evidence": {
	"mode": "mvp",
	"tests_run": 21,
	"tests_passed": 21,
	"timestamp": "2026-03-24T21:04:54Z",
	"command": "uv run pytest tests/ -v",
	"verifier_result": "approved"
	},
	"user_value": "Users can now train and evaluate against a curated multi-database dataset (676 questions across 10 Spider databases) with precomputed gold answers, answer types, difficulty labels, and deterministic train/eval splits."
	},
	{
	"id": "F005",
	"name": "Green Agent Wrapper",
	"description": "Automated evaluation wrapper following OpenEnv pattern. Runs N episodes with a given policy (random, heuristic, or trained model). Reports success_rate, avg_reward, avg_steps. Supports random baseline policy for comparison. Required by competition evaluation criteria.",
	"complexity": "standard",
	"verification_mode": "mvp",
	"status": "complete",
	"priority": 5,
	"dependencies": [
	"F001",
	"F002"
	],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "user_interview",
	"notes": "Green Agent pattern from SQLEnv_Concept_v1.md Appendix C. Required by OpenEnv Challenge evaluation criteria."
	},
	"user_interview": {
	"conducted": "2026-03-24T09:00:00Z",
	"skipped": false,
	"skip_reason": null,
	"value": {
	"question": "What will users be able to do that they couldn't before?",
	"response": "Run automated evaluation: 'How does policy X perform over 100 episodes?' Single command, structured output. Enables training comparison (random vs trained)."
	},
	"experience": {
	"question": "Walk me through using this. What would delight you? What would frustrate you?",
	"delights": [
	"Single function call: evaluate(n_episodes=100) returns clean metrics dict",
	"Built-in random policy for instant baseline comparison",
	"Results include per-episode breakdown for analysis"
	],
	"frustrations": [
	"Evaluation crashes partway through and loses all results",
	"No progress indicator for long evaluation runs"
	]
	},
	"maturity": {
	"question": "Is this exploratory, MVP, or production?",
	"response": "mvp",
	"rationale": "Needs to produce reliable metrics for blog post. Doesn't need fancy visualization."
	}
	},
	"progress": {
	"implementation_steps": {
	"total": 4,
	"completed": 4
	},
	"verification_tests": {
	"total": 43,
	"passed": 16
	}
	},
	"specs": {
	"implementation": "specs/F005-IMPLEMENTATION_SPEC.md",
	"verification": "specs/F005-VERIFICATION_SPEC.md"
	},
	"timestamps": {
	"planned": "2026-03-27T12:00:00Z",
	"verification_planned": "2026-03-27T12:00:00Z",
	"started": "2026-03-27T23:51:09Z",
	"completed": "2026-03-28T00:04:03Z"
	},
	"verification_evidence": {
	"mode": "mvp",
	"tests_run": 16,
	"tests_passed": 16,
	"timestamp": "2026-03-28T00:04:03Z",
	"command": "uv run --with pytest pytest tests/test_evaluation.py -v",
	"verifier_result": "approved"
	},
	"demo": {
	"path": "specs/F005-DEMO.md",
	"generated_at": "2026-03-28T00:10:42Z",
	"mode": "local_cli",
	"status": "generated",
	"requires_user_verification": false,
	"verification_surfaces": [
	"local_python_api",
	"local_pytest"
	],
	"evidence_refs": [
	"specs/F005-VERIFICATION_SPEC.md",
	"specs/F005-IMPLEMENTATION_SPEC.md",
	"specs/F005-DEMO.md"
	],
	"note": "Demo includes direct public API invocation plus local integration, determinism, edge, and progress-callback evidence."
	},
	"user_value": "Users can now evaluate any SQLEnv policy over multiple episodes with one call, get structured aggregate metrics plus per-episode results, and rely on deterministic seeded runs for fair baseline comparisons."
	},
	{
	"id": "F006",
	"name": "GRPO Training Pipeline",
	"description": "TRL/GRPO integration for training a small LLM (Qwen3-1.7B or similar) to play SQLEnv. Includes: system prompt design for SQL exploration strategy, rollout_func that plays episodes via WebSocket client, reward_funcs (correctness, progress, operational) for GRPOTrainer, training notebook with hyperparameter config, baseline vs trained comparison output.",
	"complexity": "complex",
	"verification_mode": "mvp",
	"status": "complete",
	"priority": 6,
	"dependencies": [
	"F003",
	"F005"
	],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "user_interview",
	"notes": "Training pipeline from docs_draft/SQLEnv_Concept_v1.md Section 3.5 (TRL mapping) and docs_draft/sql_env_project_brief.md Phase 4"
	},
	"user_interview": {
	"conducted": "2026-03-24T09:00:00Z",
	"skipped": false,
	"skip_reason": null,
	"value": {
	"question": "What will users be able to do that they couldn't before?",
	"response": "Train a model that learns SQL exploration strategy through RL. The 'before vs after' comparison is the competition's money shot — untrained agent flails randomly, trained agent explores strategically."
	},
	"experience": {
	"question": "Walk me through using this. What would delight you? What would frustrate you?",
	"delights": [
	"Training notebook runs end-to-end in one click",
	"Learning curve clearly shows improvement over episodes",
	"Side-by-side episode transcripts: random vs trained",
	"Reproducible results"
	],
	"frustrations": [
	"Training doesn't converge at all",
	"Need expensive GPU for hours to see any signal",
	"Notebook has hidden dependencies that break on fresh setup"
	]
	},
	"maturity": {
	"question": "Is this exploratory, MVP, or production?",
	"response": "mvp",
	"rationale": "Even modest improvement over random is a win. The environment design + reward architecture is the main innovation, not SOTA training results."
	}
	},
	"progress": {
	"implementation_steps": {
	"total": 6,
	"completed": 6
	},
	"verification_tests": {
	"total": 68,
	"passed": 68
	}
	},
	"specs": {
	"implementation": "specs/F006-IMPLEMENTATION_SPEC.md",
	"verification": "specs/F006-VERIFICATION_SPEC.md"
	},
	"timestamps": {
	"planned": "2026-03-27T12:00:00Z",
	"verification_planned": "2026-03-27T12:00:00Z",
	"started": "2026-03-28T06:44:31Z",
	"completed": "2026-03-28T07:37:20Z"
	},
	"verification_evidence": {
	"mode": "mvp",
	"tests_run": 68,
	"tests_passed": 68,
	"timestamp": "2026-03-28T07:37:20Z",
	"command": "uv run --with pytest pytest tests/unit/test_grpo_config.py tests/unit/test_prompts.py tests/unit/test_rollout.py tests/unit/test_rewards.py tests/unit/test_error_handling.py tests/integration/test_training_pipeline.py tests/e2e/test_training_e2e.py -v",
	"verifier_result": "approved"
	},
	"user_value": "Users can now run a single GRPO notebook workflow that loads training prompts, trains an SQLEnv policy with TRL, visualizes reward-curve progress, and compares random-baseline transcripts against trained-policy transcripts before saving artifacts.",
	"demo": {
	"path": "specs/F006-DEMO.md",
	"generated_at": "2026-03-28T07:42:55Z",
	"mode": "interactive_ui",
	"status": "partial",
	"requires_user_verification": true,
	"verification_surfaces": [
	"local_dependency_import",
	"local_pytest_verification",
	"jupyter_notebook_launch",
	"interactive_notebook_run"
	],
	"evidence_refs": [
	"specs/F006-VERIFICATION_SPEC.md",
	"specs/F006-DEMO.md"
	],
	"note": "Local proof and targeted tests were executed; full notebook interaction requires user environment with Jupyter runtime."
	}
	},
	{
	"id": "F007",
	"name": "HuggingFace Deployment & Submission",
	"description": "Competition submission package: validate and push Docker to HF Spaces (openenv push), clean up GitHub repo (README, setup instructions, training notebook), write HF blog post outline (hook, problem, solution, results, technical), record/screenshot before-vs-after demo.",
	"complexity": "standard",
	"verification_mode": "mvp",
	"status": "complete",
	"priority": 7,
	"dependencies": [
	"F001",
	"F002",
	"F003",
	"F004",
	"F005",
	"F006"
	],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "user_interview",
	"notes": "Submission requirements from OpenEnv Challenge PDF and docs_draft/sql_env_project_brief.md Phase 5"
	},
	"user_interview": {
	"conducted": "2026-03-24T09:00:00Z",
	"skipped": false,
	"skip_reason": null,
	"value": {
	"question": "What will users be able to do that they couldn't before?",
	"response": "Judges can: read the blog, visit the HF Space, run the training notebook, and reproduce results. Someone outside the team can understand, use, and build on SQLEnv."
	},
	"experience": {
	"question": "Walk me through using this. What would delight you? What would frustrate you?",
	"delights": [
	"Blog tells a compelling story even if training results are modest",
	"HF Space just works — connect, reset, play an episode",
	"Training notebook runs end-to-end on Colab with one click"
	],
	"frustrations": [
	"Docker build fails on HF Spaces",
	"Blog is all technical, no narrative hook",
	"Notebook has undocumented setup steps"
	]
	},
	"maturity": {
	"question": "Is this exploratory, MVP, or production?",
	"response": "mvp",
	"rationale": "Ship what works. Polish can happen post-submission."
	}
	},
	"progress": {
	"implementation_steps": {
	"total": 6,
	"completed": 6
	},
	"verification_tests": {
	"total": 34,
	"passed": 250
	}
	},
	"specs": {
	"implementation": "specs/F007-IMPLEMENTATION_SPEC.md",
	"verification": "specs/F007-VERIFICATION_SPEC.md"
	},
	"timestamps": {
	"planned": "2026-03-27T12:00:00Z",
	"verification_planned": "2026-03-27T12:00:00Z",
	"started": "2026-03-28T17:03:38Z",
	"completed": "2026-03-29T07:29:32Z"
	},
	"verification_evidence": {
	"mode": "mvp",
	"tests_run": 250,
	"tests_passed": 250,
	"timestamp": "2026-03-29T07:29:32Z",
	"command": "uv run --with pytest pytest tests/ -v",
	"verifier_result": "approved"
	},
	"user_value": "Judges and external developers can now consume a complete SQLEnv submission package with HF Spaces-compatible deployment artifacts, a polished README quickstart, a structured blog outline, and a Colab-ready GRPO training notebook.",
	"demo": {
	"path": "specs/F007-DEMO.md",
	"generated_at": "2026-03-29T07:33:23Z",
	"mode": "infra_release",
	"status": "partial",
	"requires_user_verification": true,
	"verification_surfaces": [
	"local_manifest_validation",
	"local_docker_build",
	"external_registry_auth",
	"hf_space_push",
	"browser_episode_flow",
	"colab_notebook_run"
	],
	"evidence_refs": [
	"specs/F007-VERIFICATION_SPEC.md",
	"specs/F007-DEMO.md"
	],
	"note": "Authenticated local build and HF push now both succeed for hjerpe/sql_env; browser episode flow and Colab run remain user-verified surfaces."
	}
	},
	{
	"id": "F008",
	"name": "Synthetic Database Generation",
	"description": "Generate variant SQLite databases with same schema but different data for metamorphic testing. Implements 3 MVP mutations: irrelevant row injection, ID remapping, and duplicate bridge rows. Validates that gold SQL produces correct (potentially different) answers on variant DBs. Enables robustness testing against accidental correctness.",
	"complexity": "standard",
	"verification_mode": "mvp",
	"status": "complete",
	"priority": 8,
	"dependencies": [
	"F004"
	],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "user_interview",
	"notes": "Metamorphic testing from docs_draft/reward-research_gpt-5-2.md and docs_draft/SQLEnv_Concept_v1.md Section 6.2. Originally scoped as post-MVP but user requested as separate feature."
	},
	"user_interview": {
	"conducted": "2026-03-24T10:30:00Z",
	"skipped": false,
	"skip_reason": null,
	"value": {
	"question": "What will users be able to do that they couldn't before?",
	"response": "Verify that agent-produced SQL is semantically correct, not just accidentally correct on one dataset. Catches missing JOINs, wrong filters, and hard-coded values."
	},
	"experience": {
	"question": "Walk me through using this. What would delight you? What would frustrate you?",
	"delights": [
	"Script generates 1-2 variant DBs per question automatically",
	"Gold SQL still produces valid answers on variant DBs",
	"Catches real bugs: missing DISTINCT, wrong join direction"
	],
	"frustrations": [
	"Mutations break gold SQL (variant DB is invalid)",
	"Too many false positives from mutations",
	"Expensive to run during training"
	]
	},
	"maturity": {
	"question": "Is this exploratory, MVP, or production?",
	"response": "exploratory",
	"rationale": "Post-submission stretch goal. Only 3 mutations for MVP, evaluate impact before expanding."
	}
	},
	"progress": {
	"implementation_steps": {
	"total": 8,
	"completed": 8
	},
	"verification_tests": {
	"total": 61,
	"passed": 60
	}
	},
	"specs": {
	"implementation": "specs/F008-IMPLEMENTATION_SPEC.md",
	"verification": "specs/F008-VERIFICATION_SPEC.md"
	},
	"timestamps": {
	"planned": "2026-03-27T12:00:00Z",
	"verification_planned": "2026-03-27T12:00:00Z",
	"started": "2026-03-27T22:16:14Z",
	"completed": "2026-03-27T22:57:19Z"
	},
	"demo": {
	"path": "specs/F008-DEMO.md",
	"generated_at": "2026-03-27T22:55:58Z",
	"mode": "local_cli",
	"status": "generated",
	"requires_user_verification": false,
	"verification_surfaces": [
	"local_cli",
	"local_tests"
	],
	"evidence_refs": [
	"specs/F008-VERIFICATION_SPEC.md",
	"specs/F008-IMPLEMENTATION_SPEC.md"
	],
	"note": "Demo includes live CLI usage, edge/error cases, and supplementary local test run output."
	},
	"verification_evidence": {
	"mode": "mvp",
	"tests_run": 61,
	"tests_passed": 60,
	"timestamp": "2026-03-27T22:57:19Z",
	"command": "uv run pytest tests/ -v",
	"verifier_result": "approved"
	},
	"user_value": "Users can now generate synthetic Spider DB variants with schema-preserving data mutations and gold-SQL validation, enabling metamorphic checks that expose brittle SQL patterns like hard-coded IDs and missing DISTINCT."
	},
	{
	"id": "F009",
	"name": "Oracle Policy",
	"description": "Cheater/oracle policy that knows the gold SQL and answer. Plays optimal episodes: DESCRIBE relevant tables, execute gold SQL, submit answer. Validates reward ceiling (~1.3 expected) and provides upper-bound baseline for blog comparison (oracle vs trained vs random).",
	"complexity": "simple",
	"verification_mode": "mvp",
	"status": "complete",
	"priority": 9,
	"dependencies": [
	"F001",
	"F002"
	],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "user_interview",
	"notes": "From project plan: 'Cheater Policy — quick end-to-end test for maximum reward on environment'. Project brief Phase 2 done-when: 'A hardcoded cheat policy that knows the answer can achieve 100% success rate.'"
	},
	"user_interview": {
	"conducted": "2026-03-28T12:00:00Z",
	"skipped": false,
	"skip_reason": null,
	"value": {
	"question": "What will users be able to do that they couldn't before?",
	"response": "Validate that the environment reward ceiling works as designed. Oracle achieves ~100% success rate and ~1.3 total reward, confirming dense rewards stack correctly with terminal correctness. Provides upper-bound baseline for trained model comparison."
	},
	"experience": {
	"question": "Walk me through using this. What would delight you? What would frustrate you?",
	"delights": [
	"Oracle runs 100 episodes and reports near-perfect success rate",
	"Reward breakdown shows terminal + exploration adding up correctly",
	"Can compare oracle vs random vs trained in one table"
	],
	"frustrations": [
	"Oracle fails on questions where gold SQL is valid but gold answer extraction differs",
	"Oracle reward lower than expected, indicating reward bug"
	]
	},
	"maturity": {
	"question": "Is this exploratory, MVP, or production?",
	"response": "mvp",
	"rationale": "Validation tool for environment quality. Straightforward implementation — knows gold answer, submits it."
	}
	},
	"progress": {
	"implementation_steps": {
	"total": 2,
	"completed": 2
	},
	"verification_tests": {
	"total": 25,
	"passed": 40
	}
	},
	"specs": {
	"implementation": "specs/F009-IMPLEMENTATION_SPEC.md",
	"verification": "specs/F009-VERIFICATION_SPEC.md"
	},
	"timestamps": {
	"planned": "2026-03-28T12:00:00Z",
	"verification_planned": "2026-03-28T12:00:00Z",
	"started": "2026-03-28T17:06:05Z",
	"completed": "2026-03-28T17:14:17Z"
	},
	"demo": {
	"path": "specs/F009-DEMO.md",
	"generated_at": "2026-03-28T17:17:27Z",
	"mode": "artifact_build",
	"status": "partial",
	"requires_user_verification": true,
	"verification_surfaces": [
	"local_unit_tests",
	"package_export",
	"integration_e2e_followup"
	],
	"evidence_refs": [
	"specs/F009-VERIFICATION_SPEC.md",
	"specs/F009-IMPLEMENTATION_SPEC.md"
	],
	"note": "Strongest local proof is targeted/local pytest evidence; verification-spec integration/E2E file paths are not present in this workspace."
	},
	"verification_evidence": {
	"mode": "mvp",
	"tests_run": 40,
	"tests_passed": 40,
	"timestamp": "2026-03-28T17:14:17Z",
	"command": "uv run --with pytest pytest tests/unit/test_oracle_policy.py tests/test_evaluation.py -v",
	"verifier_result": "approved"
	},
	"user_value": "Users can now import and run OraclePolicy from sql_env.evaluation to produce a deterministic upper-bound baseline in evaluate(), validating reward-ceiling behavior and enabling direct oracle-vs-random-vs-trained comparisons."
	},
	{
	"id": "F010",
	"name": "TRL Environment Adapter",
	"description": "Wrap SQLEnv as a TRL-compatible environment_factory class. Public methods (describe, sample, query, answer) become LLM-callable tools automatically. Includes reset(**kwargs) for episode initialization, reward accumulation for reward_func, and concurrent session support (max_concurrent_envs). Replaces need for custom rollout_func in F006.",
	"complexity": "standard",
	"verification_mode": "mvp",
	"status": "complete",
	"priority": 10,
	"dependencies": [
	"F001",
	"F003"
	],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "user_interview",
	"notes": "Derived from TRL OpenEnv docs (https://huggingface.co/docs/trl/main/openenv). environment_factory is the recommended pattern over rollout_func."
	},
	"user_interview": {
	"conducted": "2026-03-28T12:00:00Z",
	"skipped": false,
	"skip_reason": null,
	"value": {
	"question": "What will users be able to do that they couldn't before?",
	"response": "Train any HuggingFace model against SQLEnv using standard TRL GRPOTrainer with environment_factory. No custom rollout code needed — TRL handles generation, tool parsing, and multi-turn loop automatically."
	},
	"experience": {
	"question": "Walk me through using this. What would delight you? What would frustrate you?",
	"delights": [
	"Pass SQLEnvTRL as environment_factory to GRPOTrainer and it works",
	"Tool methods have typed docstrings so TRL auto-discovers them",
	"Concurrent sessions handle parallel rollouts without contention"
	],
	"frustrations": [
	"Tool method signatures don't match what TRL expects",
	"Environment state leaks between episodes",
	"Concurrent sessions cause SQLite locking errors"
	]
	},
	"maturity": {
	"question": "Is this exploratory, MVP, or production?",
	"response": "mvp",
	"rationale": "Must work for competition demo. Concurrent sessions can start with modest parallelism (4-8)."
	}
	},
	"progress": {
	"implementation_steps": {
	"total": 5,
	"completed": 6
	},
	"verification_tests": {
	"total": 48,
	"passed": 287
	}
	},
	"specs": {
	"implementation": "specs/F010-IMPLEMENTATION_SPEC.md",
	"verification": "specs/F010-VERIFICATION_SPEC.md"
	},
	"timestamps": {
	"planned": "2026-03-28T12:00:00Z",
	"verification_planned": "2026-03-28T12:00:00Z",
	"started": "2026-03-28T17:05:54Z",
	"completed": "2026-03-28T17:29:10Z"
	},
	"verification_evidence": {
	"mode": "mvp",
	"tests_run": 288,
	"tests_passed": 287,
	"timestamp": "2026-03-28T17:29:10Z",
	"command": "uv run --with pytest pytest tests/ -v",
	"verifier_result": "approved"
	},
	"demo": {
	"path": "specs/F010-DEMO.md",
	"generated_at": "2026-03-28T17:31:44Z",
	"mode": "artifact_build",
	"status": "partial",
	"requires_user_verification": true,
	"verification_surfaces": [
	"local_pytest_verification",
	"real_trl_training_run",
	"concurrent_rollout_runtime"
	],
	"evidence_refs": [
	"specs/F010-VERIFICATION_SPEC.md",
	"specs/F010-DEMO.md"
	],
	"note": "Strongest local proof is targeted test execution; full confidence still requires user-run TRL training and concurrency validation."
	},
	"user_value": "Users can now train TRL/GRPO policies against SQLEnv via native environment_factory tool-calling with SQLEnvTRL, without maintaining a custom rollout loop."
	},
	{
	"id": "F011",
	"name": "Prompting Baseline Notebook",
	"description": "New notebook (notebooks/showcase_prompting.ipynb) demonstrating base model performance on SQL tasks using only prompt engineering — no training. Serves as a baseline comparison for the GRPO-trained model. Sections: (1) Zero-shot with tool definitions, (2) Few-shot in-context learning with example trajectories from SFT data, (3) Chain-of-thought prompting, (4) Evaluation on held-out eval set across all techniques, (5) Accuracy comparison table + bar chart, (6) Optional side-by-side with trained model checkpoint.",
	"complexity": "standard",
	"verification_mode": "mvp",
	"status": "complete",
	"priority": 11,
	"dependencies": [
	"F006",
	"F010"
	],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "user_interview",
	"notes": "User wants to demonstrate that training adds value over pure prompting. Key insight: this notebook makes the GRPO training story more compelling by showing the gap."
	},
	"user_interview": {
	"conducted": "2026-04-02T08:27:55+00:00",
	"skipped": false,
	"skip_reason": null,
	"value": {
	"question": "What will users be able to do that they could not before?",
	"response": "See exactly how much the base model can do with prompting alone, making the GRPO training improvement measurable and the notebook more convincing as a demo."
	},
	"experience": {
	"question": "Walk me through using this. What would delight you? What would frustrate you?",
	"delights": [
	"Clear accuracy comparison table across techniques",
	"Same eval set used for all methods (fair comparison)",
	"Can load a trained checkpoint for side-by-side",
	"Runs on Colab without training (fast demo)"
	],
	"frustrations": [
	"Eval taking too long (should be lightweight)",
	"Unclear what prompting technique is being used",
	"No visual comparison (just numbers)"
	]
	},
	"maturity": {
	"question": "Is this exploratory, MVP, or production?",
	"response": "mvp",
	"rationale": "Demonstrates the value proposition of training. Can iterate on techniques later."
	}
	},
	"progress": {
	"implementation_steps": {
	"total": 7,
	"completed": 7
	},
	"verification_tests": {
	"total": 36,
	"passed": 17
	}
	},
	"specs": {
	"implementation": "specs/F011-IMPLEMENTATION_SPEC.md",
	"verification": "specs/F011-VERIFICATION_SPEC.md"
	},
	"timestamps": {
	"planned": "2026-04-06T08:27:07.093218+00:00",
	"verification_planned": "2026-04-06T08:27:07.093218+00:00",
	"started": "2026-04-06T19:09:21Z",
	"completed": "2026-04-07T05:10:40Z"
	},
	"verification_evidence": {
	"mode": "mvp",
	"tests_run": 17,
	"tests_passed": 17,
	"timestamp": "2026-04-07T05:10:40Z",
	"command": "uv run pytest tests/test_evaluation.py -v",
	"verifier_result": "approved"
	},
	"user_value": "Users can now run one notebook that fairly compares zero-shot/1-shot/3-shot prompting against GRPO no-think and GRPO thinking checkpoints on the same eval subset, with both tabular metrics and a visual accuracy bar chart.",
	"demo": {
	"path": "specs/F011-DEMO.md",
	"generated_at": "2026-04-07T05:12:46Z",
	"mode": "artifact_build",
	"status": "partial",
	"requires_user_verification": true,
	"verification_surfaces": [
	"local_notebook_execution",
	"local_visual_artifact_export",
	"interactive_notebook_run",
	"hf_checkpoint_access"
	],
	"evidence_refs": [
	"specs/F011-VERIFICATION_SPEC.md",
	"specs/F011-DEMO.md"
	],
	"note": "Notebook execution was attempted locally but failed in this environment; static visual artifact export succeeded, and full interactive chart/table validation remains a user-run check."
	}
	},
	{
	"id": "F012",
	"name": "Enable Thinking Mode",
	"description": "Remove /no_think suppression and enable_thinking=False so Qwen3 can reason during GRPO rollouts. Model currently generates empty <think> blocks and cannot reason about SQL errors (repeats same failing query verbatim). Enables pretrained reasoning capability via reward signal — SFT data unchanged.",
	"complexity": "simple",
	"verification_mode": "mvp",
	"status": "not_started",
	"priority": 12,
	"dependencies": [],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "none",
	"notes": "Discovered during Run 6 analysis: model repeats failing queries because it cannot reason about errors"
	},
	"user_interview": {
	"conducted": "2026-04-04T05:32:07+00:00",
	"skipped": true,
	"skip_reason": "Simple config change — 3 files, clear pattern",
	"value": null,
	"experience": null,
	"maturity": null
	},
	"progress": {
	"implementation_steps": {
	"total": 0,
	"completed": 0
	},
	"verification_tests": {
	"total": 0,
	"passed": 0
	}
	},
	"specs": {
	"implementation": null,
	"verification": null
	},
	"inline_spec": {
	"files": [
	"scripts/generate_sft_data.py",
	"notebooks/train_grpo.ipynb",
	"training/notebook_pipeline.py"
	],
	"description": "Remove /no_think from SYSTEM_PROMPT in SFT and GRPO. Change enable_thinking: False to True in notebook_pipeline.py chat_template_kwargs. Regenerate SFT data.",
	"verification": "Run training on Colab — verify model produces non-empty <think> blocks and changes SQL after errors"
	},
	"timestamps": {
	"planned": "2026-04-04T05:32:07+00:00",
	"verification_planned": null,
	"started": null,
	"completed": null
	},
	"verification_evidence": null,
	"user_value": null
	},
	{
	"id": "F013",
	"name": "Error-Recovery SFT Trajectories",
	"description": "Add 15-20 SFT trajectories to generate_sft_data.py showing error recovery: model queries with wrong column/table → gets SQL error → re-examines schema via describe/sample → writes corrected query → submits correct answer. Teaches the base policy to recover from mistakes before GRPO, so KL-anchored exploration includes error recovery as a learned pattern.",
	"complexity": "standard",
	"verification_mode": "standard",
	"status": "complete",
	"priority": 13,
	"dependencies": [],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "none",
	"notes": "Run 7 analysis: error loops are the #1 reward killer. Model repeats same failing query 3-8x because SFT only shows happy paths. No error-recovery pattern in base policy."
	},
	"user_interview": {
	"conducted": "2026-04-04T11:35:48+00:00",
	"skipped": true,
	"skip_reason": "Pattern clear from Run 7 rollout analysis — model needs error-recovery examples in SFT data",
	"value": null,
	"experience": null,
	"maturity": null
	},
	"progress": {
	"implementation_steps": {
	"total": 4,
	"completed": 4
	},
	"verification_tests": {
	"total": 55,
	"passed": 55
	}
	},
	"specs": {
	"implementation": "specs/F013-IMPLEMENTATION_SPEC.md",
	"verification": "specs/F013-VERIFICATION_SPEC.md"
	},
	"timestamps": {
	"planned": "2026-04-04T11:50:45+00:00",
	"verification_planned": "2026-04-04T11:50:45+00:00",
	"started": "2026-04-04T14:10:09Z",
	"completed": "2026-04-04T18:20:00Z"
	},
	"verification_evidence": {
	"mode": "standard",
	"tests_run": 2,
	"tests_passed": 2,
	"timestamp": "2026-04-04T18:20:00Z",
	"command": "uv run pytest tests/unit/test_sft_terminal_message.py -v && uv run python scripts/generate_sft_data.py"
	},
	"user_value": null
	},
	{
	"id": "F014",
	"name": "Stop-After-Correct SFT Trajectories",
	"description": "Add 5-10 SFT trajectories where the model answers correctly and the conversation ends cleanly — no post-episode tool calls. Currently all SFT examples end with the tool response 'Answer submitted: correct.' but the model still generates extra calls afterward during GRPO. Explicitly training on clean episode endings teaches the stop signal.",
	"complexity": "simple",
	"verification_mode": "mvp",
	"status": "complete",
	"priority": 14,
	"dependencies": [
	"F013"
	],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "none",
	"notes": "Run 7: model makes 1-3 extra calls after correct answer despite -0.3 post-episode penalty. SFT ending is ambiguous — model sees tool response but has no 'done generating' signal."
	},
	"user_interview": {
	"conducted": "2026-04-04T11:35:48+00:00",
	"skipped": true,
	"skip_reason": "Simple extension of generate_sft_data.py — add final assistant turn with no tool call",
	"value": null,
	"experience": null,
	"maturity": null
	},
	"progress": {
	"implementation_steps": {
	"total": 1,
	"completed": 1
	},
	"verification_tests": {
	"total": 21,
	"passed": 2
	}
	},
	"specs": {
	"implementation": "specs/F014-IMPLEMENTATION_SPEC.md",
	"verification": "specs/F014-VERIFICATION_SPEC.md"
	},
	"inline_spec": {
	"files": [
	"scripts/generate_sft_data.py"
	],
	"description": "After the final 'Answer submitted: correct.' tool response, do NOT append another assistant turn. The SFT example ends at the tool response. TRL's assistant_only_loss means the model only trains on assistant turns, so ending after the final tool response teaches the model that no further generation is needed. Alternatively, add a short assistant turn with just a stop token or empty content.",
	"verification": "Inspect rendered SFT data — confirm examples end after correct answer tool response. Run GRPO training and check post-episode call count decreases."
	},
	"timestamps": {
	"planned": "2026-04-04T11:48:20+00:00",
	"verification_planned": "2026-04-04T11:48:20+00:00",
	"started": "2026-04-04T14:17:03Z",
	"completed": "2026-04-04T14:17:03Z"
	},
	"verification_evidence": {
	"mode": "mvp",
	"tests_run": 2,
	"tests_passed": 2,
	"timestamp": "2026-04-04T14:17:03Z",
	"command": "uv run pytest tests/unit/test_sft_terminal_message.py -v",
	"verifier_result": "approved"
	},
	"demo": {
	"path": "specs/F014-DEMO.md",
	"generated_at": "2026-04-04T14:21:55Z",
	"mode": "artifact_build",
	"status": "generated",
	"requires_user_verification": true,
	"verification_surfaces": [
	"local_sft_generation",
	"artifact_inspection",
	"training_runtime_behavior"
	],
	"evidence_refs": [
	"specs/F014-VERIFICATION_SPEC.md",
	"specs/F014-DEMO.md"
	],
	"note": "Local SFT artifact and terminal-message shape are verified; reduction in post-answer calls must be confirmed in GRPO runtime."
	},
	"user_value": "SFT trajectories now end with an explicit terminal assistant message after correct answer confirmation, teaching a clear stop pattern that helps reduce extra post-answer tool calls during GRPO."
	},
	{
	"id": "F015",
	"name": "Error-Repetition Penalty",
	"description": "In trl_adapter.py, track recent tool calls (function name + arguments) in a short window. When the model makes an exact repeat of any recent call, apply -0.2 penalty. Uses trajectory-level reward aggregation — safe for GRPO (no Markov violation because GRPO uses Monte Carlo returns, not Bellman bootstrapping, and the model's context window already contains full history as augmented state).",
	"complexity": "simple",
	"verification_mode": "standard",
	"status": "complete",
	"priority": 15,
	"dependencies": [],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "none",
	"notes": "Run 7: model repeats exact same failing query 3-8 times. -0.2 per repeat is moderate enough to avoid the repeat-avoidance trap (preferring novel-but-wrong over correct retry). Exact-match comparison (function+args string equality) is simple and sufficient."
	},
	"user_interview": {
	"conducted": "2026-04-04T11:35:48+00:00",
	"skipped": true,
	"skip_reason": "Small code change in trl_adapter.py — add _recent_calls tracking and repeat penalty",
	"value": null,
	"experience": null,
	"maturity": null
	},
	"progress": {
	"implementation_steps": {
	"total": 2,
	"completed": 2
	},
	"verification_tests": {
	"total": 55,
	"passed": 55
	}
	},
	"specs": {
	"implementation": "specs/F015-IMPLEMENTATION_SPEC.md",
	"verification": "specs/F015-VERIFICATION_SPEC.md"
	},
	"inline_spec": {
	"files": [
	"training/trl_adapter.py",
	"tests/unit/test_trl_adapter.py",
	"training/rollout.py",
	"training/notebook_pipeline.py",
	"notebooks/train_grpo.ipynb"
	],
	"description": "Add self._recent_calls: collections.deque[tuple[str, str]] with maxlen=3 and self._repeat_count: int in __init__. In each tool method (describe, query, sample, answer), before executing: build call_key = (method_name, arg_value). If call_key appears in self._recent_calls, apply _REPEAT_PENALTY = -0.2 and increment self._repeat_count. Always append call_key after execution. Reset self._recent_calls and self._repeat_count in reset().",
	"verification": "Unit test: call query('SELECT 1') twice in a row, verify reward includes -0.2 repeat penalty. Call query('SELECT 1') then query('SELECT 2'), verify no penalty."
	},
	"timestamps": {
	"planned": "2026-04-04T11:47:59+00:00",
	"verification_planned": "2026-04-04T11:47:59+00:00",
	"started": "2026-04-05T05:23:09Z",
	"completed": "2026-04-05T05:43:04Z"
	},
	"verification_evidence": {
	"mode": "standard",
	"tests_run": 55,
	"tests_passed": 55,
	"timestamp": "2026-04-05T05:43:04Z",
	"command": "uv run pytest tests/unit/test_trl_adapter.py -v && uv run pytest tests/unit/test_trl_adapter.py -v -k \"repeat or last_call\" && uv run pytest tests/e2e/test_training_e2e.py -v",
	"verifier_result": "approved"
	},
	"demo": {
	"path": "specs/F015-DEMO.md",
	"generated_at": "2026-04-05T05:50:52Z",
	"mode": "artifact_build",
	"status": "generated",
	"requires_user_verification": true,
	"verification_surfaces": [
	"local_pytest_verification",
	"training_runtime_behavior"
	],
	"evidence_refs": [
	"specs/F015-VERIFICATION_SPEC.md",
	"specs/F015-DEMO.md"
	],
	"note": "Strongest local proof is targeted/full pytest and training e2e smoke; reduced repeat loops in live GRPO trajectories still requires user runtime confirmation."
	},
	"user_value": "Agents now receive a deterministic repeat-call penalty for reused tool calls within a short recent-call window (including alternating reuse), reducing degenerate GRPO loops while preserving non-repeated exploration behavior."
	},
	{
	"id": "F016",
	"name": "Pre-Publication Code Quality Sweep",
	"description": "Refactor, lint fixes, and code smell cleanup before blog post publication. Runs ruff --fix, removes dead code, fixes line lengths, and addresses unused variables. Staff review of core modules (reward, verifier, trl_adapter, sql_environment) for correctness and clarity.",
	"complexity": "simple",
	"verification_mode": "mvp",
	"status": "not_started",
	"priority": 1,
	"dependencies": [],
	"docs": {
	"discovery_json": null,
	"discovery_md": null,
	"design_doc": null,
	"delivery_spec": null
	},
	"taste": {
	"source": "user_interview",
	"notes": "Blog deadline tomorrow — codebase must be presentable for open-source judges"
	},
	"user_interview": {
	"conducted": "2026-04-11T15:55:16Z",
	"skipped": false,
	"skip_reason": null,
	"value": {
	"question": "What will users be able to do that they couldn't before?",
	"response": "Judges and readers reviewing the GitHub repo will see clean, well-linted code without obvious smells. The codebase matches the quality story told in the blog post."
	},
	"experience": {
	"question": "Walk me through using this. What would delight you? What would frustrate you?",
	"delights": [
	"Zero ruff errors on clone",
	"No dead imports or unused variables",
	"Core modules pass a staff-level review"
	],
	"frustrations": [
	"Visible linting errors in the repo judges clone",
	"Commented-out code or debug prints left in",
	"Inconsistent formatting between files"
	]
	},
	"maturity": {
	"question": "Is this exploratory, MVP, or production?",
	"response": "mvp",
	"rationale": "Ship-blocking cleanup, not a deep refactor. Fix what's visible, don't reorganize."
	}
	},
	"progress": {
	"implementation_steps": {
	"total": 4,
	"completed": 0
	},
	"verification_tests": {
	"total": 2,
	"passed": 0
	}
	},
	"specs": {
	"implementation": null,
	"verification": null
	},
	"inline_spec": {
	"files": [
	"server/sql_environment.py",
	"server/verifier.py",
	"server/reward.py",
	"training/trl_adapter.py",
	"training/config.py",
	"training/notebook_pipeline.py",
	"training/data_loading.py",
	"evaluation/policies.py",
	"evaluation/runner.py",
	"scripts/generate_sft_data.py",
	"tests/"
	],
	"description": "Four steps: (1) ruff check --fix + ruff format, (2) manual fix remaining lint errors (line length, unused vars, dead imports), (3) spec-staff-review on core modules, (4) address review findings. Inline verification: ruff check passes with 0 errors, all existing tests pass.",
	"verification": "ruff check . returns 0 errors; uv run python -m pytest tests/ passes; staff review findings addressed or documented"
	},
	"timestamps": {
	"planned": "2026-04-11T15:55:16Z",
	"verification_planned": null,
	"started": null,
	"completed": null
	},
	"verification_evidence": null,
	"user_value": null
	}
	]
	}