| { |
| "$schema": "./schemas/autocode-features-v1.schema.json", |
| "project": "SQLEnv - Interactive Database Query RL Environment", |
| "description": "OpenEnv Challenge submission: RL environment where agents learn to answer NL questions about databases through iterative SQL exploration", |
| "created": "2026-03-24T07:15:50Z", |
| "updated": "2026-04-11T15:55:16Z", |
| "features": [ |
| { |
| "id": "F001", |
| "name": "Core Environment Loop", |
| "description": "Complete the step/reset lifecycle: remove Ollama from environment, accept structured actions (DESCRIBE table_name, SAMPLE table_name, QUERY sql_string, ANSWER value), wire up SQLite execution with sandboxing (read-only, 5s timeout, SELECT-only), load questions from JSON on reset(), enforce step budget (15 steps), handle episode termination", |
| "complexity": "complex", |
| "verification_mode": "standard", |
| "status": "complete", |
| "priority": 1, |
| "dependencies": [], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "user_interview", |
| "notes": "Derived from docs_draft/sql_env_project_brief.md and docs_draft/SQLEnv_Concept_v1.md — the v1 spec defines the action space, episode lifecycle, and sandboxing requirements" |
| }, |
| "user_interview": { |
| "conducted": "2026-03-24T09:00:00Z", |
| "skipped": false, |
| "skip_reason": null, |
| "value": { |
| "question": "What will users be able to do that they couldn't before?", |
| "response": "Agents can play complete episodes: reset with a random question, explore a hidden schema via DESCRIBE/SAMPLE, run SQL queries, and submit answers. Currently SQL never executes — this makes the environment actually functional." |
| }, |
| "experience": { |
| "question": "Walk me through using this. What would delight you? What would frustrate you?", |
| "delights": [ |
| "Agent sends DESCRIBE employees and immediately sees column names and types", |
| "Queries execute in <100ms with clean truncated output (max 20 rows)", |
| "Bad SQL returns a clear error message the agent can learn from", |
| "Episode ends cleanly when budget exhausted or ANSWER submitted" |
| ], |
| "frustrations": [ |
| "Environment calling Ollama to interpret actions (current design) — agent should own reasoning, env should just execute", |
| "Queries hanging or crashing the environment", |
| "Opaque error messages that don't help the agent adjust" |
| ] |
| }, |
| "maturity": { |
| "question": "Is this exploratory, MVP, or production?", |
| "response": "mvp", |
| "rationale": "Competition submission — needs to work reliably for demo and training, not at production scale" |
| } |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 8, |
| "completed": 8 |
| }, |
| "verification_tests": { |
| "total": 86, |
| "passed": 25 |
| } |
| }, |
| "specs": { |
| "implementation": "specs/F001-IMPLEMENTATION_SPEC.md", |
| "verification": "specs/F001-VERIFICATION_SPEC.md" |
| }, |
| "timestamps": { |
| "planned": "2026-03-24T10:30:00Z", |
| "verification_planned": "2026-03-24T10:30:00Z", |
| "started": "2026-03-24T19:22:08Z", |
| "completed": "2026-03-24T21:27:31Z" |
| }, |
| "verification_evidence": { |
| "mode": "standard", |
| "tests_run": 25, |
| "tests_passed": 25, |
| "timestamp": "2026-03-24T21:27:31Z", |
| "command": "uv run pytest tests/ -v", |
| "verifier_result": "approved" |
| }, |
| "demo": { |
| "path": "specs/F001-DEMO.md", |
| "generated_at": "2026-03-24T21:36:32Z", |
| "mode": "local_cli", |
| "status": "partial", |
| "requires_user_verification": true, |
| "verification_surfaces": [ |
| "local_server_startup", |
| "data_provisioning", |
| "api_episode_flow" |
| ], |
| "evidence_refs": [ |
| "specs/F001-VERIFICATION_SPEC.md", |
| "specs/F001-DEMO.md" |
| ], |
| "note": "Local server and tests verified; end-to-end API episode flow requires local Spider DB provisioning." |
| }, |
| "user_value": "Agents can now run complete SQL exploration episodes end-to-end with structured DESCRIBE/SAMPLE/QUERY/ANSWER actions, live read-only SQLite execution, clear error feedback, and clean terminal completion on ANSWER or budget exhaustion." |
| }, |
| { |
| "id": "F002", |
| "name": "Answer Verification", |
| "description": "Multi-type answer comparison: integer (exact match), float (1% tolerance), string (case-insensitive normalized), list (order-insensitive set comparison). Implements verify_answer() in server/verifier.py. Returns binary correctness for terminal reward.", |
| "complexity": "standard", |
| "verification_mode": "standard", |
| "status": "complete", |
| "priority": 2, |
| "dependencies": [ |
| "F001" |
| ], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "user_interview", |
| "notes": "Answer type handling defined in docs_draft/SQLEnv_Concept_v1.md Section 4.2" |
| }, |
| "user_interview": { |
| "conducted": "2026-03-24T09:00:00Z", |
| "skipped": false, |
| "skip_reason": null, |
| "value": { |
| "question": "What will users be able to do that they couldn't before?", |
| "response": "When an agent submits ANSWER, the environment correctly determines if the answer matches the gold answer regardless of type (42 vs 42.0, 'Engineering' vs 'engineering', unordered lists)." |
| }, |
| "experience": { |
| "question": "Walk me through using this. What would delight you? What would frustrate you?", |
| "delights": [ |
| "Float comparison with tolerance handles rounding gracefully (95000.1 matches 95000)", |
| "List comparison ignores order: ['A','B'] matches ['B','A']", |
| "Clear pass/fail with no ambiguity" |
| ], |
| "frustrations": [ |
| "Correct answer rejected due to trivial formatting difference", |
| "Type coercion failures (agent says '42', gold is integer 42)" |
| ] |
| }, |
| "maturity": { |
| "question": "Is this exploratory, MVP, or production?", |
| "response": "mvp", |
| "rationale": "Must handle the 4 core answer types reliably. Table comparison can come later." |
| } |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 4, |
| "completed": 4 |
| }, |
| "verification_tests": { |
| "total": 65, |
| "passed": 65 |
| } |
| }, |
| "specs": { |
| "implementation": "specs/F002-IMPLEMENTATION_SPEC.md", |
| "verification": "specs/F002-VERIFICATION_SPEC.md" |
| }, |
| "timestamps": { |
| "planned": "2026-03-27T12:00:00Z", |
| "verification_planned": "2026-03-27T12:00:00Z", |
| "started": "2026-03-27T22:18:15Z", |
| "completed": "2026-03-27T22:33:12Z" |
| }, |
| "verification_evidence": { |
| "mode": "standard", |
| "tests_run": 65, |
| "tests_passed": 65, |
| "timestamp": "2026-03-27T22:33:12Z", |
| "command": "uv run pytest tests/ -v", |
| "verifier_result": "approved" |
| }, |
| "demo": { |
| "path": "specs/F002-DEMO.md", |
| "generated_at": "2026-03-27T22:37:50Z", |
| "mode": "artifact_build", |
| "status": "partial", |
| "requires_user_verification": true, |
| "verification_surfaces": [ |
| "local_pytest_verification", |
| "runtime_episode_scoring" |
| ], |
| "evidence_refs": [ |
| "specs/F002-VERIFICATION_SPEC.md", |
| "specs/F002-DEMO.md" |
| ], |
| "note": "Strongest local proof is targeted and integration pytest evidence; final runtime confirmation remains a user-operated episode check." |
| }, |
| "user_value": "Agents can now submit ANSWER values across integer, float, string, and list questions and receive correct terminal scoring despite formatting differences, numeric representation differences, and list order changes." |
| }, |
| { |
| "id": "F003", |
| "name": "Dense Reward System", |
| "description": "3-layer reward architecture: Layer 1 (operational validity: exec_ok +0.02, new_info +0.01 capped at 0.10, repeat -0.01, step_cost -0.005), Layer 2 (progress-to-target: weighted average of cardinality matching + value overlap + numeric range proximity, binned to 5 levels, improvement-only), Layer 3 (terminal correctness: +1.0 or 0.0). Total step rewards capped at 0.5, negative floor at -0.2.", |
| "complexity": "complex", |
| "verification_mode": "standard", |
| "status": "complete", |
| "priority": 3, |
| "dependencies": [ |
| "F001", |
| "F002" |
| ], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "user_interview", |
| "notes": "Reward architecture defined in docs_draft/SQLEnv_Concept_v1.md Section 3 and docs_draft/reward-research_gpt-5-2.md. Distance metrics detailed in docs_draft/reward_design.md." |
| }, |
| "user_interview": { |
| "conducted": "2026-03-24T09:00:00Z", |
| "skipped": false, |
| "skip_reason": null, |
| "value": { |
| "question": "What will users be able to do that they couldn't before?", |
| "response": "Agents get meaningful feedback during exploration — not just 0/1 at the end. A query that returns 40 when the answer is 42 gets partial credit. Discovering new schema info gets a small reward. This makes GRPO training converge." |
| }, |
| "experience": { |
| "question": "Walk me through using this. What would delight you? What would frustrate you?", |
| "delights": [ |
| "Reward varies meaningfully: random exploration ~0.1, targeted queries ~0.3, correct answer ~1.3", |
| "Anti-gaming works: agent can't farm rewards by describing everything or repeating queries", |
| "Progress signal is coarsened to prevent reward hill-climbing" |
| ], |
| "frustrations": [ |
| "Reward hacking: agent learns to exploit shaping rather than solve the task", |
| "Reward too sparse: agent gets no signal until terminal step", |
| "Over-complex reward that's hard to debug" |
| ] |
| }, |
| "maturity": { |
| "question": "Is this exploratory, MVP, or production?", |
| "response": "mvp", |
| "rationale": "Start with weighted average of 3 metrics (cardinality, value overlap, numeric range). Add complexity only if training shows issues." |
| } |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 7, |
| "completed": 7 |
| }, |
| "verification_tests": { |
| "total": 166, |
| "passed": 166 |
| } |
| }, |
| "specs": { |
| "implementation": "specs/F003-IMPLEMENTATION_SPEC.md", |
| "verification": "specs/F003-VERIFICATION_SPEC.md" |
| }, |
| "timestamps": { |
| "planned": "2026-03-27T12:00:00Z", |
| "verification_planned": "2026-03-27T12:00:00Z", |
| "started": "2026-03-27T23:51:47Z", |
| "completed": "2026-03-28T06:05:02Z" |
| }, |
| "verification_evidence": { |
| "mode": "standard", |
| "tests_run": 166, |
| "tests_passed": 166, |
| "timestamp": "2026-03-28T06:05:02Z", |
| "command": "uv run --with pytest pytest tests/ -v", |
| "verifier_result": "approved" |
| }, |
| "demo": { |
| "path": "specs/F003-DEMO.md", |
| "generated_at": "2026-03-28T06:07:34Z", |
| "mode": "artifact_build", |
| "status": "generated", |
| "requires_user_verification": true, |
| "verification_surfaces": [ |
| "local_pytest_verification", |
| "runtime_episode_flow" |
| ], |
| "evidence_refs": [ |
| "specs/F003-VERIFICATION_SPEC.md", |
| "specs/F003-DEMO.md" |
| ], |
| "note": "Strongest local proof is targeted smoke/unit execution; full reward calibration and live episode behavior should be confirmed in a user-run episode/training context." |
| }, |
| "user_value": "Agents now receive dense numeric rewards on every non-terminal DESCRIBE/SAMPLE/QUERY step based on execution quality and progress toward the gold answer, while terminal correctness still dominates total episode reward." |
| }, |
| { |
| "id": "F004", |
| "name": "Question Dataset Expansion", |
| "description": "Expand from 53 questions (one DB) to 100+ questions across 5-10 Spider databases. Add difficulty labels (easy/medium/hard at 40/40/20 split), answer_type metadata, and gold_answer fields. Create train/eval split (70/30). Curate for diversity of answer types and SQL patterns.", |
| "complexity": "standard", |
| "verification_mode": "mvp", |
| "status": "complete", |
| "priority": 4, |
| "dependencies": [], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "user_interview", |
| "notes": "Dataset requirements from docs_draft/sql_env_project_brief.md Section 3 and SQLEnv_Concept_v1.md Section 4" |
| }, |
| "user_interview": { |
| "conducted": "2026-03-24T09:00:00Z", |
| "skipped": false, |
| "skip_reason": null, |
| "value": { |
| "question": "What will users be able to do that they couldn't before?", |
| "response": "Training on diverse databases and question types. Current single-DB setup risks overfitting to one schema." |
| }, |
| "experience": { |
| "question": "Walk me through using this. What would delight you? What would frustrate you?", |
| "delights": [ |
| "Clear difficulty progression: easy questions have 1-2 tables, hard ones have 5+", |
| "Each question has pre-computed gold_answer so reward doesn't need to re-execute gold SQL every episode", |
| "Train/eval split prevents training on evaluation data" |
| ], |
| "frustrations": [ |
| "Questions that require SQL features SQLite doesn't support", |
| "Ambiguous gold answers (multiple valid interpretations)", |
| "All questions from same domain = no generalization" |
| ] |
| }, |
| "maturity": { |
| "question": "Is this exploratory, MVP, or production?", |
| "response": "mvp", |
| "rationale": "100 well-curated questions is sufficient for competition demo. Quality over quantity." |
| } |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 6, |
| "completed": 6 |
| }, |
| "verification_tests": { |
| "total": 66, |
| "passed": 21 |
| } |
| }, |
| "specs": { |
| "implementation": "specs/F004-IMPLEMENTATION_SPEC.md", |
| "verification": "specs/F004-VERIFICATION_SPEC.md" |
| }, |
| "demo": { |
| "path": "specs/F004-DEMO.md", |
| "generated_at": "2026-03-24T21:07:31Z" |
| }, |
| "timestamps": { |
| "planned": "2026-03-24T10:30:00Z", |
| "verification_planned": "2026-03-24T10:30:00Z", |
| "started": "2026-03-24T16:53:35Z", |
| "completed": "2026-03-24T21:04:54Z" |
| }, |
| "verification_evidence": { |
| "mode": "mvp", |
| "tests_run": 21, |
| "tests_passed": 21, |
| "timestamp": "2026-03-24T21:04:54Z", |
| "command": "uv run pytest tests/ -v", |
| "verifier_result": "approved" |
| }, |
| "user_value": "Users can now train and evaluate against a curated multi-database dataset (676 questions across 10 Spider databases) with precomputed gold answers, answer types, difficulty labels, and deterministic train/eval splits." |
| }, |
| { |
| "id": "F005", |
| "name": "Green Agent Wrapper", |
| "description": "Automated evaluation wrapper following OpenEnv pattern. Runs N episodes with a given policy (random, heuristic, or trained model). Reports success_rate, avg_reward, avg_steps. Supports random baseline policy for comparison. Required by competition evaluation criteria.", |
| "complexity": "standard", |
| "verification_mode": "mvp", |
| "status": "complete", |
| "priority": 5, |
| "dependencies": [ |
| "F001", |
| "F002" |
| ], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "user_interview", |
| "notes": "Green Agent pattern from SQLEnv_Concept_v1.md Appendix C. Required by OpenEnv Challenge evaluation criteria." |
| }, |
| "user_interview": { |
| "conducted": "2026-03-24T09:00:00Z", |
| "skipped": false, |
| "skip_reason": null, |
| "value": { |
| "question": "What will users be able to do that they couldn't before?", |
| "response": "Run automated evaluation: 'How does policy X perform over 100 episodes?' Single command, structured output. Enables training comparison (random vs trained)." |
| }, |
| "experience": { |
| "question": "Walk me through using this. What would delight you? What would frustrate you?", |
| "delights": [ |
| "Single function call: evaluate(n_episodes=100) returns clean metrics dict", |
| "Built-in random policy for instant baseline comparison", |
| "Results include per-episode breakdown for analysis" |
| ], |
| "frustrations": [ |
| "Evaluation crashes partway through and loses all results", |
| "No progress indicator for long evaluation runs" |
| ] |
| }, |
| "maturity": { |
| "question": "Is this exploratory, MVP, or production?", |
| "response": "mvp", |
| "rationale": "Needs to produce reliable metrics for blog post. Doesn't need fancy visualization." |
| } |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 4, |
| "completed": 4 |
| }, |
| "verification_tests": { |
| "total": 43, |
| "passed": 16 |
| } |
| }, |
| "specs": { |
| "implementation": "specs/F005-IMPLEMENTATION_SPEC.md", |
| "verification": "specs/F005-VERIFICATION_SPEC.md" |
| }, |
| "timestamps": { |
| "planned": "2026-03-27T12:00:00Z", |
| "verification_planned": "2026-03-27T12:00:00Z", |
| "started": "2026-03-27T23:51:09Z", |
| "completed": "2026-03-28T00:04:03Z" |
| }, |
| "verification_evidence": { |
| "mode": "mvp", |
| "tests_run": 16, |
| "tests_passed": 16, |
| "timestamp": "2026-03-28T00:04:03Z", |
| "command": "uv run --with pytest pytest tests/test_evaluation.py -v", |
| "verifier_result": "approved" |
| }, |
| "demo": { |
| "path": "specs/F005-DEMO.md", |
| "generated_at": "2026-03-28T00:10:42Z", |
| "mode": "local_cli", |
| "status": "generated", |
| "requires_user_verification": false, |
| "verification_surfaces": [ |
| "local_python_api", |
| "local_pytest" |
| ], |
| "evidence_refs": [ |
| "specs/F005-VERIFICATION_SPEC.md", |
| "specs/F005-IMPLEMENTATION_SPEC.md", |
| "specs/F005-DEMO.md" |
| ], |
| "note": "Demo includes direct public API invocation plus local integration, determinism, edge, and progress-callback evidence." |
| }, |
| "user_value": "Users can now evaluate any SQLEnv policy over multiple episodes with one call, get structured aggregate metrics plus per-episode results, and rely on deterministic seeded runs for fair baseline comparisons." |
| }, |
| { |
| "id": "F006", |
| "name": "GRPO Training Pipeline", |
| "description": "TRL/GRPO integration for training a small LLM (Qwen3-1.7B or similar) to play SQLEnv. Includes: system prompt design for SQL exploration strategy, rollout_func that plays episodes via WebSocket client, reward_funcs (correctness, progress, operational) for GRPOTrainer, training notebook with hyperparameter config, baseline vs trained comparison output.", |
| "complexity": "complex", |
| "verification_mode": "mvp", |
| "status": "complete", |
| "priority": 6, |
| "dependencies": [ |
| "F003", |
| "F005" |
| ], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "user_interview", |
| "notes": "Training pipeline from docs_draft/SQLEnv_Concept_v1.md Section 3.5 (TRL mapping) and docs_draft/sql_env_project_brief.md Phase 4" |
| }, |
| "user_interview": { |
| "conducted": "2026-03-24T09:00:00Z", |
| "skipped": false, |
| "skip_reason": null, |
| "value": { |
| "question": "What will users be able to do that they couldn't before?", |
| "response": "Train a model that learns SQL exploration strategy through RL. The 'before vs after' comparison is the competition's money shot — untrained agent flails randomly, trained agent explores strategically." |
| }, |
| "experience": { |
| "question": "Walk me through using this. What would delight you? What would frustrate you?", |
| "delights": [ |
| "Training notebook runs end-to-end in one click", |
| "Learning curve clearly shows improvement over episodes", |
| "Side-by-side episode transcripts: random vs trained", |
| "Reproducible results" |
| ], |
| "frustrations": [ |
| "Training doesn't converge at all", |
| "Need expensive GPU for hours to see any signal", |
| "Notebook has hidden dependencies that break on fresh setup" |
| ] |
| }, |
| "maturity": { |
| "question": "Is this exploratory, MVP, or production?", |
| "response": "mvp", |
| "rationale": "Even modest improvement over random is a win. The environment design + reward architecture is the main innovation, not SOTA training results." |
| } |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 6, |
| "completed": 6 |
| }, |
| "verification_tests": { |
| "total": 68, |
| "passed": 68 |
| } |
| }, |
| "specs": { |
| "implementation": "specs/F006-IMPLEMENTATION_SPEC.md", |
| "verification": "specs/F006-VERIFICATION_SPEC.md" |
| }, |
| "timestamps": { |
| "planned": "2026-03-27T12:00:00Z", |
| "verification_planned": "2026-03-27T12:00:00Z", |
| "started": "2026-03-28T06:44:31Z", |
| "completed": "2026-03-28T07:37:20Z" |
| }, |
| "verification_evidence": { |
| "mode": "mvp", |
| "tests_run": 68, |
| "tests_passed": 68, |
| "timestamp": "2026-03-28T07:37:20Z", |
| "command": "uv run --with pytest pytest tests/unit/test_grpo_config.py tests/unit/test_prompts.py tests/unit/test_rollout.py tests/unit/test_rewards.py tests/unit/test_error_handling.py tests/integration/test_training_pipeline.py tests/e2e/test_training_e2e.py -v", |
| "verifier_result": "approved" |
| }, |
| "user_value": "Users can now run a single GRPO notebook workflow that loads training prompts, trains an SQLEnv policy with TRL, visualizes reward-curve progress, and compares random-baseline transcripts against trained-policy transcripts before saving artifacts.", |
| "demo": { |
| "path": "specs/F006-DEMO.md", |
| "generated_at": "2026-03-28T07:42:55Z", |
| "mode": "interactive_ui", |
| "status": "partial", |
| "requires_user_verification": true, |
| "verification_surfaces": [ |
| "local_dependency_import", |
| "local_pytest_verification", |
| "jupyter_notebook_launch", |
| "interactive_notebook_run" |
| ], |
| "evidence_refs": [ |
| "specs/F006-VERIFICATION_SPEC.md", |
| "specs/F006-DEMO.md" |
| ], |
| "note": "Local proof and targeted tests were executed; full notebook interaction requires user environment with Jupyter runtime." |
| } |
| }, |
| { |
| "id": "F007", |
| "name": "HuggingFace Deployment & Submission", |
| "description": "Competition submission package: validate and push Docker to HF Spaces (openenv push), clean up GitHub repo (README, setup instructions, training notebook), write HF blog post outline (hook, problem, solution, results, technical), record/screenshot before-vs-after demo.", |
| "complexity": "standard", |
| "verification_mode": "mvp", |
| "status": "complete", |
| "priority": 7, |
| "dependencies": [ |
| "F001", |
| "F002", |
| "F003", |
| "F004", |
| "F005", |
| "F006" |
| ], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "user_interview", |
| "notes": "Submission requirements from OpenEnv Challenge PDF and docs_draft/sql_env_project_brief.md Phase 5" |
| }, |
| "user_interview": { |
| "conducted": "2026-03-24T09:00:00Z", |
| "skipped": false, |
| "skip_reason": null, |
| "value": { |
| "question": "What will users be able to do that they couldn't before?", |
| "response": "Judges can: read the blog, visit the HF Space, run the training notebook, and reproduce results. Someone outside the team can understand, use, and build on SQLEnv." |
| }, |
| "experience": { |
| "question": "Walk me through using this. What would delight you? What would frustrate you?", |
| "delights": [ |
| "Blog tells a compelling story even if training results are modest", |
| "HF Space just works — connect, reset, play an episode", |
| "Training notebook runs end-to-end on Colab with one click" |
| ], |
| "frustrations": [ |
| "Docker build fails on HF Spaces", |
| "Blog is all technical, no narrative hook", |
| "Notebook has undocumented setup steps" |
| ] |
| }, |
| "maturity": { |
| "question": "Is this exploratory, MVP, or production?", |
| "response": "mvp", |
| "rationale": "Ship what works. Polish can happen post-submission." |
| } |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 6, |
| "completed": 6 |
| }, |
| "verification_tests": { |
| "total": 250, |
| "passed": 250 |
| } |
| }, |
| "specs": { |
| "implementation": "specs/F007-IMPLEMENTATION_SPEC.md", |
| "verification": "specs/F007-VERIFICATION_SPEC.md" |
| }, |
| "timestamps": { |
| "planned": "2026-03-27T12:00:00Z", |
| "verification_planned": "2026-03-27T12:00:00Z", |
| "started": "2026-03-28T17:03:38Z", |
| "completed": "2026-03-29T07:29:32Z" |
| }, |
| "verification_evidence": { |
| "mode": "mvp", |
| "tests_run": 250, |
| "tests_passed": 250, |
| "timestamp": "2026-03-29T07:29:32Z", |
| "command": "uv run --with pytest pytest tests/ -v", |
| "verifier_result": "approved" |
| }, |
| "user_value": "Judges and external developers can now consume a complete SQLEnv submission package with HF Spaces-compatible deployment artifacts, a polished README quickstart, a structured blog outline, and a Colab-ready GRPO training notebook.", |
| "demo": { |
| "path": "specs/F007-DEMO.md", |
| "generated_at": "2026-03-29T07:33:23Z", |
| "mode": "infra_release", |
| "status": "partial", |
| "requires_user_verification": true, |
| "verification_surfaces": [ |
| "local_manifest_validation", |
| "local_docker_build", |
| "external_registry_auth", |
| "hf_space_push", |
| "browser_episode_flow", |
| "colab_notebook_run" |
| ], |
| "evidence_refs": [ |
| "specs/F007-VERIFICATION_SPEC.md", |
| "specs/F007-DEMO.md" |
| ], |
| "note": "Authenticated local build and HF push now both succeed for hjerpe/sql_env; browser episode flow and Colab run remain user-verified surfaces." |
| } |
| }, |
| { |
| "id": "F008", |
| "name": "Synthetic Database Generation", |
| "description": "Generate variant SQLite databases with same schema but different data for metamorphic testing. Implements 3 MVP mutations: irrelevant row injection, ID remapping, and duplicate bridge rows. Validates that gold SQL produces correct (potentially different) answers on variant DBs. Enables robustness testing against accidental correctness.", |
| "complexity": "standard", |
| "verification_mode": "mvp", |
| "status": "complete", |
| "priority": 8, |
| "dependencies": [ |
| "F004" |
| ], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "user_interview", |
| "notes": "Metamorphic testing from docs_draft/reward-research_gpt-5-2.md and docs_draft/SQLEnv_Concept_v1.md Section 6.2. Originally scoped as post-MVP but user requested as separate feature." |
| }, |
| "user_interview": { |
| "conducted": "2026-03-24T10:30:00Z", |
| "skipped": false, |
| "skip_reason": null, |
| "value": { |
| "question": "What will users be able to do that they couldn't before?", |
| "response": "Verify that agent-produced SQL is semantically correct, not just accidentally correct on one dataset. Catches missing JOINs, wrong filters, and hard-coded values." |
| }, |
| "experience": { |
| "question": "Walk me through using this. What would delight you? What would frustrate you?", |
| "delights": [ |
| "Script generates 1-2 variant DBs per question automatically", |
| "Gold SQL still produces valid answers on variant DBs", |
| "Catches real bugs: missing DISTINCT, wrong join direction" |
| ], |
| "frustrations": [ |
| "Mutations break gold SQL (variant DB is invalid)", |
| "Too many false positives from mutations", |
| "Expensive to run during training" |
| ] |
| }, |
| "maturity": { |
| "question": "Is this exploratory, MVP, or production?", |
| "response": "exploratory", |
| "rationale": "Post-submission stretch goal. Only 3 mutations for MVP, evaluate impact before expanding." |
| } |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 8, |
| "completed": 8 |
| }, |
| "verification_tests": { |
| "total": 61, |
| "passed": 60 |
| } |
| }, |
| "specs": { |
| "implementation": "specs/F008-IMPLEMENTATION_SPEC.md", |
| "verification": "specs/F008-VERIFICATION_SPEC.md" |
| }, |
| "timestamps": { |
| "planned": "2026-03-27T12:00:00Z", |
| "verification_planned": "2026-03-27T12:00:00Z", |
| "started": "2026-03-27T22:16:14Z", |
| "completed": "2026-03-27T22:57:19Z" |
| }, |
| "demo": { |
| "path": "specs/F008-DEMO.md", |
| "generated_at": "2026-03-27T22:55:58Z", |
| "mode": "local_cli", |
| "status": "generated", |
| "requires_user_verification": false, |
| "verification_surfaces": [ |
| "local_cli", |
| "local_tests" |
| ], |
| "evidence_refs": [ |
| "specs/F008-VERIFICATION_SPEC.md", |
| "specs/F008-IMPLEMENTATION_SPEC.md" |
| ], |
| "note": "Demo includes live CLI usage, edge/error cases, and supplementary local test run output." |
| }, |
| "verification_evidence": { |
| "mode": "mvp", |
| "tests_run": 61, |
| "tests_passed": 60, |
| "timestamp": "2026-03-27T22:57:19Z", |
| "command": "uv run pytest tests/ -v", |
| "verifier_result": "approved" |
| }, |
| "user_value": "Users can now generate synthetic Spider DB variants with schema-preserving data mutations and gold-SQL validation, enabling metamorphic checks that expose brittle SQL patterns like hard-coded IDs and missing DISTINCT." |
| }, |
| { |
| "id": "F009", |
| "name": "Oracle Policy", |
| "description": "Cheater/oracle policy that knows the gold SQL and answer. Plays optimal episodes: DESCRIBE relevant tables, execute gold SQL, submit answer. Validates reward ceiling (~1.3 expected) and provides upper-bound baseline for blog comparison (oracle vs trained vs random).", |
| "complexity": "simple", |
| "verification_mode": "mvp", |
| "status": "complete", |
| "priority": 9, |
| "dependencies": [ |
| "F001", |
| "F002" |
| ], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "user_interview", |
| "notes": "From project plan: 'Cheater Policy — quick end-to-end test for maximum reward on environment'. Project brief Phase 2 done-when: 'A hardcoded cheat policy that knows the answer can achieve 100% success rate.'" |
| }, |
| "user_interview": { |
| "conducted": "2026-03-28T12:00:00Z", |
| "skipped": false, |
| "skip_reason": null, |
| "value": { |
| "question": "What will users be able to do that they couldn't before?", |
| "response": "Validate that the environment reward ceiling works as designed. Oracle achieves ~100% success rate and ~1.3 total reward, confirming dense rewards stack correctly with terminal correctness. Provides upper-bound baseline for trained model comparison." |
| }, |
| "experience": { |
| "question": "Walk me through using this. What would delight you? What would frustrate you?", |
| "delights": [ |
| "Oracle runs 100 episodes and reports near-perfect success rate", |
| "Reward breakdown shows terminal + exploration adding up correctly", |
| "Can compare oracle vs random vs trained in one table" |
| ], |
| "frustrations": [ |
| "Oracle fails on questions where gold SQL is valid but gold answer extraction differs", |
| "Oracle reward lower than expected, indicating reward bug" |
| ] |
| }, |
| "maturity": { |
| "question": "Is this exploratory, MVP, or production?", |
| "response": "mvp", |
| "rationale": "Validation tool for environment quality. Straightforward implementation — knows gold answer, submits it." |
| } |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 2, |
| "completed": 2 |
| }, |
| "verification_tests": { |
| "total": 40, |
| "passed": 40 |
| } |
| }, |
| "specs": { |
| "implementation": "specs/F009-IMPLEMENTATION_SPEC.md", |
| "verification": "specs/F009-VERIFICATION_SPEC.md" |
| }, |
| "timestamps": { |
| "planned": "2026-03-28T12:00:00Z", |
| "verification_planned": "2026-03-28T12:00:00Z", |
| "started": "2026-03-28T17:06:05Z", |
| "completed": "2026-03-28T17:14:17Z" |
| }, |
| "demo": { |
| "path": "specs/F009-DEMO.md", |
| "generated_at": "2026-03-28T17:17:27Z", |
| "mode": "artifact_build", |
| "status": "partial", |
| "requires_user_verification": true, |
| "verification_surfaces": [ |
| "local_unit_tests", |
| "package_export", |
| "integration_e2e_followup" |
| ], |
| "evidence_refs": [ |
| "specs/F009-VERIFICATION_SPEC.md", |
| "specs/F009-IMPLEMENTATION_SPEC.md" |
| ], |
| "note": "Strongest local proof is targeted/local pytest evidence; verification-spec integration/E2E file paths are not present in this workspace." |
| }, |
| "verification_evidence": { |
| "mode": "mvp", |
| "tests_run": 40, |
| "tests_passed": 40, |
| "timestamp": "2026-03-28T17:14:17Z", |
| "command": "uv run --with pytest pytest tests/unit/test_oracle_policy.py tests/test_evaluation.py -v", |
| "verifier_result": "approved" |
| }, |
| "user_value": "Users can now import and run OraclePolicy from sql_env.evaluation to produce a deterministic upper-bound baseline in evaluate(), validating reward-ceiling behavior and enabling direct oracle-vs-random-vs-trained comparisons." |
| }, |
| { |
| "id": "F010", |
| "name": "TRL Environment Adapter", |
| "description": "Wrap SQLEnv as a TRL-compatible environment_factory class. Public methods (describe, sample, query, answer) become LLM-callable tools automatically. Includes reset(**kwargs) for episode initialization, reward accumulation for reward_func, and concurrent session support (max_concurrent_envs). Replaces need for custom rollout_func in F006.", |
| "complexity": "standard", |
| "verification_mode": "mvp", |
| "status": "complete", |
| "priority": 10, |
| "dependencies": [ |
| "F001", |
| "F003" |
| ], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "user_interview", |
| "notes": "Derived from TRL OpenEnv docs (https://huggingface.co/docs/trl/main/openenv). environment_factory is the recommended pattern over rollout_func." |
| }, |
| "user_interview": { |
| "conducted": "2026-03-28T12:00:00Z", |
| "skipped": false, |
| "skip_reason": null, |
| "value": { |
| "question": "What will users be able to do that they couldn't before?", |
| "response": "Train any HuggingFace model against SQLEnv using standard TRL GRPOTrainer with environment_factory. No custom rollout code needed — TRL handles generation, tool parsing, and multi-turn loop automatically." |
| }, |
| "experience": { |
| "question": "Walk me through using this. What would delight you? What would frustrate you?", |
| "delights": [ |
| "Pass SQLEnvTRL as environment_factory to GRPOTrainer and it works", |
| "Tool methods have typed docstrings so TRL auto-discovers them", |
| "Concurrent sessions handle parallel rollouts without contention" |
| ], |
| "frustrations": [ |
| "Tool method signatures don't match what TRL expects", |
| "Environment state leaks between episodes", |
| "Concurrent sessions cause SQLite locking errors" |
| ] |
| }, |
| "maturity": { |
| "question": "Is this exploratory, MVP, or production?", |
| "response": "mvp", |
| "rationale": "Must work for competition demo. Concurrent sessions can start with modest parallelism (4-8)." |
| } |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 5, |
| "completed": 6 |
| }, |
| "verification_tests": { |
| "total": 48, |
| "passed": 287 |
| } |
| }, |
| "specs": { |
| "implementation": "specs/F010-IMPLEMENTATION_SPEC.md", |
| "verification": "specs/F010-VERIFICATION_SPEC.md" |
| }, |
| "timestamps": { |
| "planned": "2026-03-28T12:00:00Z", |
| "verification_planned": "2026-03-28T12:00:00Z", |
| "started": "2026-03-28T17:05:54Z", |
| "completed": "2026-03-28T17:29:10Z" |
| }, |
| "verification_evidence": { |
| "mode": "mvp", |
| "tests_run": 288, |
| "tests_passed": 287, |
| "timestamp": "2026-03-28T17:29:10Z", |
| "command": "uv run --with pytest pytest tests/ -v", |
| "verifier_result": "approved" |
| }, |
| "demo": { |
| "path": "specs/F010-DEMO.md", |
| "generated_at": "2026-03-28T17:31:44Z", |
| "mode": "artifact_build", |
| "status": "partial", |
| "requires_user_verification": true, |
| "verification_surfaces": [ |
| "local_pytest_verification", |
| "real_trl_training_run", |
| "concurrent_rollout_runtime" |
| ], |
| "evidence_refs": [ |
| "specs/F010-VERIFICATION_SPEC.md", |
| "specs/F010-DEMO.md" |
| ], |
| "note": "Strongest local proof is targeted test execution; full confidence still requires user-run TRL training and concurrency validation." |
| }, |
| "user_value": "Users can now train TRL/GRPO policies against SQLEnv via native environment_factory tool-calling with SQLEnvTRL, without maintaining a custom rollout loop." |
| }, |
| { |
| "id": "F011", |
| "name": "Prompting Baseline Notebook", |
| "description": "New notebook (notebooks/showcase_prompting.ipynb) demonstrating base model performance on SQL tasks using only prompt engineering — no training. Serves as a baseline comparison for the GRPO-trained model. Sections: (1) Zero-shot with tool definitions, (2) Few-shot in-context learning with example trajectories from SFT data, (3) Chain-of-thought prompting, (4) Evaluation on held-out eval set across all techniques, (5) Accuracy comparison table + bar chart, (6) Optional side-by-side with trained model checkpoint.", |
| "complexity": "standard", |
| "verification_mode": "mvp", |
| "status": "complete", |
| "priority": 11, |
| "dependencies": [ |
| "F006", |
| "F010" |
| ], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "user_interview", |
| "notes": "User wants to demonstrate that training adds value over pure prompting. Key insight: this notebook makes the GRPO training story more compelling by showing the gap." |
| }, |
| "user_interview": { |
| "conducted": "2026-04-02T08:27:55+00:00", |
| "skipped": false, |
| "skip_reason": null, |
| "value": { |
| "question": "What will users be able to do that they could not before?", |
| "response": "See exactly how much the base model can do with prompting alone, making the GRPO training improvement measurable and the notebook more convincing as a demo." |
| }, |
| "experience": { |
| "question": "Walk me through using this. What would delight you? What would frustrate you?", |
| "delights": [ |
| "Clear accuracy comparison table across techniques", |
| "Same eval set used for all methods (fair comparison)", |
| "Can load a trained checkpoint for side-by-side", |
| "Runs on Colab without training (fast demo)" |
| ], |
| "frustrations": [ |
| "Eval taking too long (should be lightweight)", |
| "Unclear what prompting technique is being used", |
| "No visual comparison (just numbers)" |
| ] |
| }, |
| "maturity": { |
| "question": "Is this exploratory, MVP, or production?", |
| "response": "mvp", |
| "rationale": "Demonstrates the value proposition of training. Can iterate on techniques later." |
| } |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 7, |
| "completed": 7 |
| }, |
| "verification_tests": { |
| "total": 36, |
| "passed": 17 |
| } |
| }, |
| "specs": { |
| "implementation": "specs/F011-IMPLEMENTATION_SPEC.md", |
| "verification": "specs/F011-VERIFICATION_SPEC.md" |
| }, |
| "timestamps": { |
| "planned": "2026-04-06T08:27:07.093218+00:00", |
| "verification_planned": "2026-04-06T08:27:07.093218+00:00", |
| "started": "2026-04-06T19:09:21Z", |
| "completed": "2026-04-07T05:10:40Z" |
| }, |
| "verification_evidence": { |
| "mode": "mvp", |
| "tests_run": 17, |
| "tests_passed": 17, |
| "timestamp": "2026-04-07T05:10:40Z", |
| "command": "uv run pytest tests/test_evaluation.py -v", |
| "verifier_result": "approved" |
| }, |
| "user_value": "Users can now run one notebook that fairly compares zero-shot/1-shot/3-shot prompting against GRPO no-think and GRPO thinking checkpoints on the same eval subset, with both tabular metrics and a visual accuracy bar chart.", |
| "demo": { |
| "path": "specs/F011-DEMO.md", |
| "generated_at": "2026-04-07T05:12:46Z", |
| "mode": "artifact_build", |
| "status": "partial", |
| "requires_user_verification": true, |
| "verification_surfaces": [ |
| "local_notebook_execution", |
| "local_visual_artifact_export", |
| "interactive_notebook_run", |
| "hf_checkpoint_access" |
| ], |
| "evidence_refs": [ |
| "specs/F011-VERIFICATION_SPEC.md", |
| "specs/F011-DEMO.md" |
| ], |
| "note": "Notebook execution was attempted locally but failed in this environment; static visual artifact export succeeded, and full interactive chart/table validation remains a user-run check." |
| } |
| }, |
| { |
| "id": "F012", |
| "name": "Enable Thinking Mode", |
| "description": "Remove /no_think suppression and enable_thinking=False so Qwen3 can reason during GRPO rollouts. Model currently generates empty <think> blocks and cannot reason about SQL errors (repeats same failing query verbatim). Enables pretrained reasoning capability via reward signal — SFT data unchanged.", |
| "complexity": "simple", |
| "verification_mode": "mvp", |
| "status": "not_started", |
| "priority": 12, |
| "dependencies": [], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "none", |
| "notes": "Discovered during Run 6 analysis: model repeats failing queries because it cannot reason about errors" |
| }, |
| "user_interview": { |
| "conducted": "2026-04-04T05:32:07+00:00", |
| "skipped": true, |
| "skip_reason": "Simple config change — 3 files, clear pattern", |
| "value": null, |
| "experience": null, |
| "maturity": null |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 0, |
| "completed": 0 |
| }, |
| "verification_tests": { |
| "total": 0, |
| "passed": 0 |
| } |
| }, |
| "specs": { |
| "implementation": null, |
| "verification": null |
| }, |
| "inline_spec": { |
| "files": [ |
| "scripts/generate_sft_data.py", |
| "notebooks/train_grpo.ipynb", |
| "training/notebook_pipeline.py" |
| ], |
| "description": "Remove /no_think from SYSTEM_PROMPT in SFT and GRPO. Change enable_thinking: False to True in notebook_pipeline.py chat_template_kwargs. Regenerate SFT data.", |
| "verification": "Run training on Colab — verify model produces non-empty <think> blocks and changes SQL after errors" |
| }, |
| "timestamps": { |
| "planned": "2026-04-04T05:32:07+00:00", |
| "verification_planned": null, |
| "started": null, |
| "completed": null |
| }, |
| "verification_evidence": null, |
| "user_value": null |
| }, |
| { |
| "id": "F013", |
| "name": "Error-Recovery SFT Trajectories", |
| "description": "Add 15-20 SFT trajectories to generate_sft_data.py showing error recovery: model queries with wrong column/table → gets SQL error → re-examines schema via describe/sample → writes corrected query → submits correct answer. Teaches the base policy to recover from mistakes before GRPO, so KL-anchored exploration includes error recovery as a learned pattern.", |
| "complexity": "standard", |
| "verification_mode": "standard", |
| "status": "complete", |
| "priority": 13, |
| "dependencies": [], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "none", |
| "notes": "Run 7 analysis: error loops are the #1 reward killer. Model repeats same failing query 3-8x because SFT only shows happy paths. No error-recovery pattern in base policy." |
| }, |
| "user_interview": { |
| "conducted": "2026-04-04T11:35:48+00:00", |
| "skipped": true, |
| "skip_reason": "Pattern clear from Run 7 rollout analysis — model needs error-recovery examples in SFT data", |
| "value": null, |
| "experience": null, |
| "maturity": null |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 4, |
| "completed": 4 |
| }, |
| "verification_tests": { |
| "total": 55, |
| "passed": 55 |
| } |
| }, |
| "specs": { |
| "implementation": "specs/F013-IMPLEMENTATION_SPEC.md", |
| "verification": "specs/F013-VERIFICATION_SPEC.md" |
| }, |
| "timestamps": { |
| "planned": "2026-04-04T11:50:45+00:00", |
| "verification_planned": "2026-04-04T11:50:45+00:00", |
| "started": "2026-04-04T14:10:09Z", |
| "completed": "2026-04-04T18:20:00Z" |
| }, |
| "verification_evidence": { |
| "mode": "standard", |
| "tests_run": 2, |
| "tests_passed": 2, |
| "timestamp": "2026-04-04T18:20:00Z", |
| "command": "uv run pytest tests/unit/test_sft_terminal_message.py -v && uv run python scripts/generate_sft_data.py" |
| }, |
| "user_value": null |
| }, |
| { |
| "id": "F014", |
| "name": "Stop-After-Correct SFT Trajectories", |
| "description": "Add 5-10 SFT trajectories where the model answers correctly and the conversation ends cleanly — no post-episode tool calls. Currently all SFT examples end with the tool response 'Answer submitted: correct.' but the model still generates extra calls afterward during GRPO. Explicitly training on clean episode endings teaches the stop signal.", |
| "complexity": "simple", |
| "verification_mode": "mvp", |
| "status": "complete", |
| "priority": 14, |
| "dependencies": [ |
| "F013" |
| ], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "none", |
| "notes": "Run 7: model makes 1-3 extra calls after correct answer despite -0.3 post-episode penalty. SFT ending is ambiguous — model sees tool response but has no 'done generating' signal." |
| }, |
| "user_interview": { |
| "conducted": "2026-04-04T11:35:48+00:00", |
| "skipped": true, |
| "skip_reason": "Simple extension of generate_sft_data.py — add final assistant turn with no tool call", |
| "value": null, |
| "experience": null, |
| "maturity": null |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 1, |
| "completed": 1 |
| }, |
| "verification_tests": { |
| "total": 21, |
| "passed": 2 |
| } |
| }, |
| "specs": { |
| "implementation": "specs/F014-IMPLEMENTATION_SPEC.md", |
| "verification": "specs/F014-VERIFICATION_SPEC.md" |
| }, |
| "inline_spec": { |
| "files": [ |
| "scripts/generate_sft_data.py" |
| ], |
| "description": "After the final 'Answer submitted: correct.' tool response, do NOT append another assistant turn. The SFT example ends at the tool response. TRL's assistant_only_loss means the model only trains on assistant turns, so ending after the final tool response teaches the model that no further generation is needed. Alternatively, add a short assistant turn with just a stop token or empty content.", |
| "verification": "Inspect rendered SFT data — confirm examples end after correct answer tool response. Run GRPO training and check post-episode call count decreases." |
| }, |
| "timestamps": { |
| "planned": "2026-04-04T11:48:20+00:00", |
| "verification_planned": "2026-04-04T11:48:20+00:00", |
| "started": "2026-04-04T14:17:03Z", |
| "completed": "2026-04-04T14:17:03Z" |
| }, |
| "verification_evidence": { |
| "mode": "mvp", |
| "tests_run": 2, |
| "tests_passed": 2, |
| "timestamp": "2026-04-04T14:17:03Z", |
| "command": "uv run pytest tests/unit/test_sft_terminal_message.py -v", |
| "verifier_result": "approved" |
| }, |
| "demo": { |
| "path": "specs/F014-DEMO.md", |
| "generated_at": "2026-04-04T14:21:55Z", |
| "mode": "artifact_build", |
| "status": "generated", |
| "requires_user_verification": true, |
| "verification_surfaces": [ |
| "local_sft_generation", |
| "artifact_inspection", |
| "training_runtime_behavior" |
| ], |
| "evidence_refs": [ |
| "specs/F014-VERIFICATION_SPEC.md", |
| "specs/F014-DEMO.md" |
| ], |
| "note": "Local SFT artifact and terminal-message shape are verified; reduction in post-answer calls must be confirmed in GRPO runtime." |
| }, |
| "user_value": "SFT trajectories now end with an explicit terminal assistant message after correct answer confirmation, teaching a clear stop pattern that helps reduce extra post-answer tool calls during GRPO." |
| }, |
| { |
| "id": "F015", |
| "name": "Error-Repetition Penalty", |
| "description": "In trl_adapter.py, track recent tool calls (function name + arguments) in a short window. When the model makes an exact repeat of any recent call, apply -0.2 penalty. Uses trajectory-level reward aggregation — safe for GRPO (no Markov violation because GRPO uses Monte Carlo returns, not Bellman bootstrapping, and the model's context window already contains full history as augmented state).", |
| "complexity": "simple", |
| "verification_mode": "standard", |
| "status": "complete", |
| "priority": 15, |
| "dependencies": [], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "none", |
| "notes": "Run 7: model repeats exact same failing query 3-8 times. -0.2 per repeat is moderate enough to avoid the repeat-avoidance trap (preferring novel-but-wrong over correct retry). Exact-match comparison (function+args string equality) is simple and sufficient." |
| }, |
| "user_interview": { |
| "conducted": "2026-04-04T11:35:48+00:00", |
| "skipped": true, |
| "skip_reason": "Small code change in trl_adapter.py — add _recent_calls tracking and repeat penalty", |
| "value": null, |
| "experience": null, |
| "maturity": null |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 2, |
| "completed": 2 |
| }, |
| "verification_tests": { |
| "total": 55, |
| "passed": 55 |
| } |
| }, |
| "specs": { |
| "implementation": "specs/F015-IMPLEMENTATION_SPEC.md", |
| "verification": "specs/F015-VERIFICATION_SPEC.md" |
| }, |
| "inline_spec": { |
| "files": [ |
| "training/trl_adapter.py", |
| "tests/unit/test_trl_adapter.py", |
| "training/rollout.py", |
| "training/notebook_pipeline.py", |
| "notebooks/train_grpo.ipynb" |
| ], |
| "description": "Add self._recent_calls: collections.deque[tuple[str, str]] with maxlen=3 and self._repeat_count: int in __init__. In each tool method (describe, query, sample, answer), before executing: build call_key = (method_name, arg_value). If call_key appears in self._recent_calls, apply _REPEAT_PENALTY = -0.2 and increment self._repeat_count. Always append call_key after execution. Reset self._recent_calls and self._repeat_count in reset().", |
| "verification": "Unit test: call query('SELECT 1') twice in a row, verify reward includes -0.2 repeat penalty. Call query('SELECT 1') then query('SELECT 2'), verify no penalty." |
| }, |
| "timestamps": { |
| "planned": "2026-04-04T11:47:59+00:00", |
| "verification_planned": "2026-04-04T11:47:59+00:00", |
| "started": "2026-04-05T05:23:09Z", |
| "completed": "2026-04-05T05:43:04Z" |
| }, |
| "verification_evidence": { |
| "mode": "standard", |
| "tests_run": 55, |
| "tests_passed": 55, |
| "timestamp": "2026-04-05T05:43:04Z", |
| "command": "uv run pytest tests/unit/test_trl_adapter.py -v && uv run pytest tests/unit/test_trl_adapter.py -v -k \"repeat or last_call\" && uv run pytest tests/e2e/test_training_e2e.py -v", |
| "verifier_result": "approved" |
| }, |
| "demo": { |
| "path": "specs/F015-DEMO.md", |
| "generated_at": "2026-04-05T05:50:52Z", |
| "mode": "artifact_build", |
| "status": "generated", |
| "requires_user_verification": true, |
| "verification_surfaces": [ |
| "local_pytest_verification", |
| "training_runtime_behavior" |
| ], |
| "evidence_refs": [ |
| "specs/F015-VERIFICATION_SPEC.md", |
| "specs/F015-DEMO.md" |
| ], |
| "note": "Strongest local proof is targeted/full pytest and training e2e smoke; reduced repeat loops in live GRPO trajectories still requires user runtime confirmation." |
| }, |
| "user_value": "Agents now receive a deterministic repeat-call penalty for reused tool calls within a short recent-call window (including alternating reuse), reducing degenerate GRPO loops while preserving non-repeated exploration behavior." |
| }, |
| { |
| "id": "F016", |
| "name": "Pre-Publication Code Quality Sweep", |
| "description": "Refactor, lint fixes, and code smell cleanup before blog post publication. Runs ruff --fix, removes dead code, fixes line lengths, and addresses unused variables. Staff review of core modules (reward, verifier, trl_adapter, sql_environment) for correctness and clarity.", |
| "complexity": "simple", |
| "verification_mode": "mvp", |
| "status": "not_started", |
| "priority": 1, |
| "dependencies": [], |
| "docs": { |
| "discovery_json": null, |
| "discovery_md": null, |
| "design_doc": null, |
| "delivery_spec": null |
| }, |
| "taste": { |
| "source": "user_interview", |
| "notes": "Blog deadline tomorrow — codebase must be presentable for open-source judges" |
| }, |
| "user_interview": { |
| "conducted": "2026-04-11T15:55:16Z", |
| "skipped": false, |
| "skip_reason": null, |
| "value": { |
| "question": "What will users be able to do that they couldn't before?", |
| "response": "Judges and readers reviewing the GitHub repo will see clean, well-linted code without obvious smells. The codebase matches the quality story told in the blog post." |
| }, |
| "experience": { |
| "question": "Walk me through using this. What would delight you? What would frustrate you?", |
| "delights": [ |
| "Zero ruff errors on clone", |
| "No dead imports or unused variables", |
| "Core modules pass a staff-level review" |
| ], |
| "frustrations": [ |
| "Visible linting errors in the repo judges clone", |
| "Commented-out code or debug prints left in", |
| "Inconsistent formatting between files" |
| ] |
| }, |
| "maturity": { |
| "question": "Is this exploratory, MVP, or production?", |
| "response": "mvp", |
| "rationale": "Ship-blocking cleanup, not a deep refactor. Fix what's visible, don't reorganize." |
| } |
| }, |
| "progress": { |
| "implementation_steps": { |
| "total": 4, |
| "completed": 0 |
| }, |
| "verification_tests": { |
| "total": 2, |
| "passed": 0 |
| } |
| }, |
| "specs": { |
| "implementation": null, |
| "verification": null |
| }, |
| "inline_spec": { |
| "files": [ |
| "server/sql_environment.py", |
| "server/verifier.py", |
| "server/reward.py", |
| "training/trl_adapter.py", |
| "training/config.py", |
| "training/notebook_pipeline.py", |
| "training/data_loading.py", |
| "evaluation/policies.py", |
| "evaluation/runner.py", |
| "scripts/generate_sft_data.py", |
| "tests/" |
| ], |
| "description": "Four steps: (1) ruff check --fix + ruff format, (2) manual fix remaining lint errors (line length, unused vars, dead imports), (3) spec-staff-review on core modules, (4) address review findings. Inline verification: ruff check passes with 0 errors, all existing tests pass.", |
| "verification": "ruff check . returns 0 errors; uv run python -m pytest tests/ passes; staff review findings addressed or documented" |
| }, |
| "timestamps": { |
| "planned": "2026-04-11T15:55:16Z", |
| "verification_planned": null, |
| "started": null, |
| "completed": null |
| }, |
| "verification_evidence": null, |
| "user_value": null |
| } |
| ] |
| } |
|
|