sql_env / specs /FEATURES.json
hjerpe's picture
Upload folder using huggingface_hub
9e64e71 verified
{
"$schema": "./schemas/autocode-features-v1.schema.json",
"project": "SQLEnv - Interactive Database Query RL Environment",
"description": "OpenEnv Challenge submission: RL environment where agents learn to answer NL questions about databases through iterative SQL exploration",
"created": "2026-03-24T07:15:50Z",
"updated": "2026-04-11T15:55:16Z",
"features": [
{
"id": "F001",
"name": "Core Environment Loop",
"description": "Complete the step/reset lifecycle: remove Ollama from environment, accept structured actions (DESCRIBE table_name, SAMPLE table_name, QUERY sql_string, ANSWER value), wire up SQLite execution with sandboxing (read-only, 5s timeout, SELECT-only), load questions from JSON on reset(), enforce step budget (15 steps), handle episode termination",
"complexity": "complex",
"verification_mode": "standard",
"status": "complete",
"priority": 1,
"dependencies": [],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "user_interview",
"notes": "Derived from docs_draft/sql_env_project_brief.md and docs_draft/SQLEnv_Concept_v1.md — the v1 spec defines the action space, episode lifecycle, and sandboxing requirements"
},
"user_interview": {
"conducted": "2026-03-24T09:00:00Z",
"skipped": false,
"skip_reason": null,
"value": {
"question": "What will users be able to do that they couldn't before?",
"response": "Agents can play complete episodes: reset with a random question, explore a hidden schema via DESCRIBE/SAMPLE, run SQL queries, and submit answers. Currently SQL never executes — this makes the environment actually functional."
},
"experience": {
"question": "Walk me through using this. What would delight you? What would frustrate you?",
"delights": [
"Agent sends DESCRIBE employees and immediately sees column names and types",
"Queries execute in <100ms with clean truncated output (max 20 rows)",
"Bad SQL returns a clear error message the agent can learn from",
"Episode ends cleanly when budget exhausted or ANSWER submitted"
],
"frustrations": [
"Environment calling Ollama to interpret actions (current design) — agent should own reasoning, env should just execute",
"Queries hanging or crashing the environment",
"Opaque error messages that don't help the agent adjust"
]
},
"maturity": {
"question": "Is this exploratory, MVP, or production?",
"response": "mvp",
"rationale": "Competition submission — needs to work reliably for demo and training, not at production scale"
}
},
"progress": {
"implementation_steps": {
"total": 8,
"completed": 8
},
"verification_tests": {
"total": 86,
"passed": 25
}
},
"specs": {
"implementation": "specs/F001-IMPLEMENTATION_SPEC.md",
"verification": "specs/F001-VERIFICATION_SPEC.md"
},
"timestamps": {
"planned": "2026-03-24T10:30:00Z",
"verification_planned": "2026-03-24T10:30:00Z",
"started": "2026-03-24T19:22:08Z",
"completed": "2026-03-24T21:27:31Z"
},
"verification_evidence": {
"mode": "standard",
"tests_run": 25,
"tests_passed": 25,
"timestamp": "2026-03-24T21:27:31Z",
"command": "uv run pytest tests/ -v",
"verifier_result": "approved"
},
"demo": {
"path": "specs/F001-DEMO.md",
"generated_at": "2026-03-24T21:36:32Z",
"mode": "local_cli",
"status": "partial",
"requires_user_verification": true,
"verification_surfaces": [
"local_server_startup",
"data_provisioning",
"api_episode_flow"
],
"evidence_refs": [
"specs/F001-VERIFICATION_SPEC.md",
"specs/F001-DEMO.md"
],
"note": "Local server and tests verified; end-to-end API episode flow requires local Spider DB provisioning."
},
"user_value": "Agents can now run complete SQL exploration episodes end-to-end with structured DESCRIBE/SAMPLE/QUERY/ANSWER actions, live read-only SQLite execution, clear error feedback, and clean terminal completion on ANSWER or budget exhaustion."
},
{
"id": "F002",
"name": "Answer Verification",
"description": "Multi-type answer comparison: integer (exact match), float (1% tolerance), string (case-insensitive normalized), list (order-insensitive set comparison). Implements verify_answer() in server/verifier.py. Returns binary correctness for terminal reward.",
"complexity": "standard",
"verification_mode": "standard",
"status": "complete",
"priority": 2,
"dependencies": [
"F001"
],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "user_interview",
"notes": "Answer type handling defined in docs_draft/SQLEnv_Concept_v1.md Section 4.2"
},
"user_interview": {
"conducted": "2026-03-24T09:00:00Z",
"skipped": false,
"skip_reason": null,
"value": {
"question": "What will users be able to do that they couldn't before?",
"response": "When an agent submits ANSWER, the environment correctly determines if the answer matches the gold answer regardless of type (42 vs 42.0, 'Engineering' vs 'engineering', unordered lists)."
},
"experience": {
"question": "Walk me through using this. What would delight you? What would frustrate you?",
"delights": [
"Float comparison with tolerance handles rounding gracefully (95000.1 matches 95000)",
"List comparison ignores order: ['A','B'] matches ['B','A']",
"Clear pass/fail with no ambiguity"
],
"frustrations": [
"Correct answer rejected due to trivial formatting difference",
"Type coercion failures (agent says '42', gold is integer 42)"
]
},
"maturity": {
"question": "Is this exploratory, MVP, or production?",
"response": "mvp",
"rationale": "Must handle the 4 core answer types reliably. Table comparison can come later."
}
},
"progress": {
"implementation_steps": {
"total": 4,
"completed": 4
},
"verification_tests": {
"total": 65,
"passed": 65
}
},
"specs": {
"implementation": "specs/F002-IMPLEMENTATION_SPEC.md",
"verification": "specs/F002-VERIFICATION_SPEC.md"
},
"timestamps": {
"planned": "2026-03-27T12:00:00Z",
"verification_planned": "2026-03-27T12:00:00Z",
"started": "2026-03-27T22:18:15Z",
"completed": "2026-03-27T22:33:12Z"
},
"verification_evidence": {
"mode": "standard",
"tests_run": 65,
"tests_passed": 65,
"timestamp": "2026-03-27T22:33:12Z",
"command": "uv run pytest tests/ -v",
"verifier_result": "approved"
},
"demo": {
"path": "specs/F002-DEMO.md",
"generated_at": "2026-03-27T22:37:50Z",
"mode": "artifact_build",
"status": "partial",
"requires_user_verification": true,
"verification_surfaces": [
"local_pytest_verification",
"runtime_episode_scoring"
],
"evidence_refs": [
"specs/F002-VERIFICATION_SPEC.md",
"specs/F002-DEMO.md"
],
"note": "Strongest local proof is targeted and integration pytest evidence; final runtime confirmation remains a user-operated episode check."
},
"user_value": "Agents can now submit ANSWER values across integer, float, string, and list questions and receive correct terminal scoring despite formatting differences, numeric representation differences, and list order changes."
},
{
"id": "F003",
"name": "Dense Reward System",
"description": "3-layer reward architecture: Layer 1 (operational validity: exec_ok +0.02, new_info +0.01 capped at 0.10, repeat -0.01, step_cost -0.005), Layer 2 (progress-to-target: weighted average of cardinality matching + value overlap + numeric range proximity, binned to 5 levels, improvement-only), Layer 3 (terminal correctness: +1.0 or 0.0). Total step rewards capped at 0.5, negative floor at -0.2.",
"complexity": "complex",
"verification_mode": "standard",
"status": "complete",
"priority": 3,
"dependencies": [
"F001",
"F002"
],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "user_interview",
"notes": "Reward architecture defined in docs_draft/SQLEnv_Concept_v1.md Section 3 and docs_draft/reward-research_gpt-5-2.md. Distance metrics detailed in docs_draft/reward_design.md."
},
"user_interview": {
"conducted": "2026-03-24T09:00:00Z",
"skipped": false,
"skip_reason": null,
"value": {
"question": "What will users be able to do that they couldn't before?",
"response": "Agents get meaningful feedback during exploration — not just 0/1 at the end. A query that returns 40 when the answer is 42 gets partial credit. Discovering new schema info gets a small reward. This makes GRPO training converge."
},
"experience": {
"question": "Walk me through using this. What would delight you? What would frustrate you?",
"delights": [
"Reward varies meaningfully: random exploration ~0.1, targeted queries ~0.3, correct answer ~1.3",
"Anti-gaming works: agent can't farm rewards by describing everything or repeating queries",
"Progress signal is coarsened to prevent reward hill-climbing"
],
"frustrations": [
"Reward hacking: agent learns to exploit shaping rather than solve the task",
"Reward too sparse: agent gets no signal until terminal step",
"Over-complex reward that's hard to debug"
]
},
"maturity": {
"question": "Is this exploratory, MVP, or production?",
"response": "mvp",
"rationale": "Start with weighted average of 3 metrics (cardinality, value overlap, numeric range). Add complexity only if training shows issues."
}
},
"progress": {
"implementation_steps": {
"total": 7,
"completed": 7
},
"verification_tests": {
"total": 61,
"passed": 61
}
},
"specs": {
"implementation": "specs/F003-IMPLEMENTATION_SPEC.md",
"verification": "specs/F003-VERIFICATION_SPEC.md"
},
"timestamps": {
"planned": "2026-03-27T12:00:00Z",
"verification_planned": "2026-03-27T12:00:00Z",
"started": "2026-03-27T23:51:47Z",
"completed": "2026-03-28T06:05:02Z"
},
"verification_evidence": {
"mode": "standard",
"tests_run": 166,
"tests_passed": 166,
"timestamp": "2026-03-28T06:05:02Z",
"command": "uv run --with pytest pytest tests/ -v",
"verifier_result": "approved"
},
"demo": {
"path": "specs/F003-DEMO.md",
"generated_at": "2026-03-28T06:07:34Z",
"mode": "artifact_build",
"status": "generated",
"requires_user_verification": true,
"verification_surfaces": [
"local_pytest_verification",
"runtime_episode_flow"
],
"evidence_refs": [
"specs/F003-VERIFICATION_SPEC.md",
"specs/F003-DEMO.md"
],
"note": "Strongest local proof is targeted smoke/unit execution; full reward calibration and live episode behavior should be confirmed in a user-run episode/training context."
},
"user_value": "Agents now receive dense numeric rewards on every non-terminal DESCRIBE/SAMPLE/QUERY step based on execution quality and progress toward the gold answer, while terminal correctness still dominates total episode reward."
},
{
"id": "F004",
"name": "Question Dataset Expansion",
"description": "Expand from 53 questions (one DB) to 100+ questions across 5-10 Spider databases. Add difficulty labels (easy/medium/hard at 40/40/20 split), answer_type metadata, and gold_answer fields. Create train/eval split (70/30). Curate for diversity of answer types and SQL patterns.",
"complexity": "standard",
"verification_mode": "mvp",
"status": "complete",
"priority": 4,
"dependencies": [],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "user_interview",
"notes": "Dataset requirements from docs_draft/sql_env_project_brief.md Section 3 and SQLEnv_Concept_v1.md Section 4"
},
"user_interview": {
"conducted": "2026-03-24T09:00:00Z",
"skipped": false,
"skip_reason": null,
"value": {
"question": "What will users be able to do that they couldn't before?",
"response": "Training on diverse databases and question types. Current single-DB setup risks overfitting to one schema."
},
"experience": {
"question": "Walk me through using this. What would delight you? What would frustrate you?",
"delights": [
"Clear difficulty progression: easy questions have 1-2 tables, hard ones have 5+",
"Each question has pre-computed gold_answer so reward doesn't need to re-execute gold SQL every episode",
"Train/eval split prevents training on evaluation data"
],
"frustrations": [
"Questions that require SQL features SQLite doesn't support",
"Ambiguous gold answers (multiple valid interpretations)",
"All questions from same domain = no generalization"
]
},
"maturity": {
"question": "Is this exploratory, MVP, or production?",
"response": "mvp",
"rationale": "100 well-curated questions is sufficient for competition demo. Quality over quantity."
}
},
"progress": {
"implementation_steps": {
"total": 6,
"completed": 6
},
"verification_tests": {
"total": 66,
"passed": 21
}
},
"specs": {
"implementation": "specs/F004-IMPLEMENTATION_SPEC.md",
"verification": "specs/F004-VERIFICATION_SPEC.md"
},
"demo": {
"path": "specs/F004-DEMO.md",
"generated_at": "2026-03-24T21:07:31Z"
},
"timestamps": {
"planned": "2026-03-24T10:30:00Z",
"verification_planned": "2026-03-24T10:30:00Z",
"started": "2026-03-24T16:53:35Z",
"completed": "2026-03-24T21:04:54Z"
},
"verification_evidence": {
"mode": "mvp",
"tests_run": 21,
"tests_passed": 21,
"timestamp": "2026-03-24T21:04:54Z",
"command": "uv run pytest tests/ -v",
"verifier_result": "approved"
},
"user_value": "Users can now train and evaluate against a curated multi-database dataset (676 questions across 10 Spider databases) with precomputed gold answers, answer types, difficulty labels, and deterministic train/eval splits."
},
{
"id": "F005",
"name": "Green Agent Wrapper",
"description": "Automated evaluation wrapper following OpenEnv pattern. Runs N episodes with a given policy (random, heuristic, or trained model). Reports success_rate, avg_reward, avg_steps. Supports random baseline policy for comparison. Required by competition evaluation criteria.",
"complexity": "standard",
"verification_mode": "mvp",
"status": "complete",
"priority": 5,
"dependencies": [
"F001",
"F002"
],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "user_interview",
"notes": "Green Agent pattern from SQLEnv_Concept_v1.md Appendix C. Required by OpenEnv Challenge evaluation criteria."
},
"user_interview": {
"conducted": "2026-03-24T09:00:00Z",
"skipped": false,
"skip_reason": null,
"value": {
"question": "What will users be able to do that they couldn't before?",
"response": "Run automated evaluation: 'How does policy X perform over 100 episodes?' Single command, structured output. Enables training comparison (random vs trained)."
},
"experience": {
"question": "Walk me through using this. What would delight you? What would frustrate you?",
"delights": [
"Single function call: evaluate(n_episodes=100) returns clean metrics dict",
"Built-in random policy for instant baseline comparison",
"Results include per-episode breakdown for analysis"
],
"frustrations": [
"Evaluation crashes partway through and loses all results",
"No progress indicator for long evaluation runs"
]
},
"maturity": {
"question": "Is this exploratory, MVP, or production?",
"response": "mvp",
"rationale": "Needs to produce reliable metrics for blog post. Doesn't need fancy visualization."
}
},
"progress": {
"implementation_steps": {
"total": 4,
"completed": 4
},
"verification_tests": {
"total": 43,
"passed": 16
}
},
"specs": {
"implementation": "specs/F005-IMPLEMENTATION_SPEC.md",
"verification": "specs/F005-VERIFICATION_SPEC.md"
},
"timestamps": {
"planned": "2026-03-27T12:00:00Z",
"verification_planned": "2026-03-27T12:00:00Z",
"started": "2026-03-27T23:51:09Z",
"completed": "2026-03-28T00:04:03Z"
},
"verification_evidence": {
"mode": "mvp",
"tests_run": 16,
"tests_passed": 16,
"timestamp": "2026-03-28T00:04:03Z",
"command": "uv run --with pytest pytest tests/test_evaluation.py -v",
"verifier_result": "approved"
},
"demo": {
"path": "specs/F005-DEMO.md",
"generated_at": "2026-03-28T00:10:42Z",
"mode": "local_cli",
"status": "generated",
"requires_user_verification": false,
"verification_surfaces": [
"local_python_api",
"local_pytest"
],
"evidence_refs": [
"specs/F005-VERIFICATION_SPEC.md",
"specs/F005-IMPLEMENTATION_SPEC.md",
"specs/F005-DEMO.md"
],
"note": "Demo includes direct public API invocation plus local integration, determinism, edge, and progress-callback evidence."
},
"user_value": "Users can now evaluate any SQLEnv policy over multiple episodes with one call, get structured aggregate metrics plus per-episode results, and rely on deterministic seeded runs for fair baseline comparisons."
},
{
"id": "F006",
"name": "GRPO Training Pipeline",
"description": "TRL/GRPO integration for training a small LLM (Qwen3-1.7B or similar) to play SQLEnv. Includes: system prompt design for SQL exploration strategy, rollout_func that plays episodes via WebSocket client, reward_funcs (correctness, progress, operational) for GRPOTrainer, training notebook with hyperparameter config, baseline vs trained comparison output.",
"complexity": "complex",
"verification_mode": "mvp",
"status": "complete",
"priority": 6,
"dependencies": [
"F003",
"F005"
],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "user_interview",
"notes": "Training pipeline from docs_draft/SQLEnv_Concept_v1.md Section 3.5 (TRL mapping) and docs_draft/sql_env_project_brief.md Phase 4"
},
"user_interview": {
"conducted": "2026-03-24T09:00:00Z",
"skipped": false,
"skip_reason": null,
"value": {
"question": "What will users be able to do that they couldn't before?",
"response": "Train a model that learns SQL exploration strategy through RL. The 'before vs after' comparison is the competition's money shot — untrained agent flails randomly, trained agent explores strategically."
},
"experience": {
"question": "Walk me through using this. What would delight you? What would frustrate you?",
"delights": [
"Training notebook runs end-to-end in one click",
"Learning curve clearly shows improvement over episodes",
"Side-by-side episode transcripts: random vs trained",
"Reproducible results"
],
"frustrations": [
"Training doesn't converge at all",
"Need expensive GPU for hours to see any signal",
"Notebook has hidden dependencies that break on fresh setup"
]
},
"maturity": {
"question": "Is this exploratory, MVP, or production?",
"response": "mvp",
"rationale": "Even modest improvement over random is a win. The environment design + reward architecture is the main innovation, not SOTA training results."
}
},
"progress": {
"implementation_steps": {
"total": 6,
"completed": 6
},
"verification_tests": {
"total": 68,
"passed": 68
}
},
"specs": {
"implementation": "specs/F006-IMPLEMENTATION_SPEC.md",
"verification": "specs/F006-VERIFICATION_SPEC.md"
},
"timestamps": {
"planned": "2026-03-27T12:00:00Z",
"verification_planned": "2026-03-27T12:00:00Z",
"started": "2026-03-28T06:44:31Z",
"completed": "2026-03-28T07:37:20Z"
},
"verification_evidence": {
"mode": "mvp",
"tests_run": 68,
"tests_passed": 68,
"timestamp": "2026-03-28T07:37:20Z",
"command": "uv run --with pytest pytest tests/unit/test_grpo_config.py tests/unit/test_prompts.py tests/unit/test_rollout.py tests/unit/test_rewards.py tests/unit/test_error_handling.py tests/integration/test_training_pipeline.py tests/e2e/test_training_e2e.py -v",
"verifier_result": "approved"
},
"user_value": "Users can now run a single GRPO notebook workflow that loads training prompts, trains an SQLEnv policy with TRL, visualizes reward-curve progress, and compares random-baseline transcripts against trained-policy transcripts before saving artifacts.",
"demo": {
"path": "specs/F006-DEMO.md",
"generated_at": "2026-03-28T07:42:55Z",
"mode": "interactive_ui",
"status": "partial",
"requires_user_verification": true,
"verification_surfaces": [
"local_dependency_import",
"local_pytest_verification",
"jupyter_notebook_launch",
"interactive_notebook_run"
],
"evidence_refs": [
"specs/F006-VERIFICATION_SPEC.md",
"specs/F006-DEMO.md"
],
"note": "Local proof and targeted tests were executed; full notebook interaction requires user environment with Jupyter runtime."
}
},
{
"id": "F007",
"name": "HuggingFace Deployment & Submission",
"description": "Competition submission package: validate and push Docker to HF Spaces (openenv push), clean up GitHub repo (README, setup instructions, training notebook), write HF blog post outline (hook, problem, solution, results, technical), record/screenshot before-vs-after demo.",
"complexity": "standard",
"verification_mode": "mvp",
"status": "complete",
"priority": 7,
"dependencies": [
"F001",
"F002",
"F003",
"F004",
"F005",
"F006"
],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "user_interview",
"notes": "Submission requirements from OpenEnv Challenge PDF and docs_draft/sql_env_project_brief.md Phase 5"
},
"user_interview": {
"conducted": "2026-03-24T09:00:00Z",
"skipped": false,
"skip_reason": null,
"value": {
"question": "What will users be able to do that they couldn't before?",
"response": "Judges can: read the blog, visit the HF Space, run the training notebook, and reproduce results. Someone outside the team can understand, use, and build on SQLEnv."
},
"experience": {
"question": "Walk me through using this. What would delight you? What would frustrate you?",
"delights": [
"Blog tells a compelling story even if training results are modest",
"HF Space just works — connect, reset, play an episode",
"Training notebook runs end-to-end on Colab with one click"
],
"frustrations": [
"Docker build fails on HF Spaces",
"Blog is all technical, no narrative hook",
"Notebook has undocumented setup steps"
]
},
"maturity": {
"question": "Is this exploratory, MVP, or production?",
"response": "mvp",
"rationale": "Ship what works. Polish can happen post-submission."
}
},
"progress": {
"implementation_steps": {
"total": 6,
"completed": 6
},
"verification_tests": {
"total": 34,
"passed": 34
}
},
"specs": {
"implementation": "specs/F007-IMPLEMENTATION_SPEC.md",
"verification": "specs/F007-VERIFICATION_SPEC.md"
},
"timestamps": {
"planned": "2026-03-27T12:00:00Z",
"verification_planned": "2026-03-27T12:00:00Z",
"started": "2026-03-28T17:03:38Z",
"completed": "2026-03-29T07:29:32Z"
},
"verification_evidence": {
"mode": "mvp",
"tests_run": 250,
"tests_passed": 250,
"timestamp": "2026-03-29T07:29:32Z",
"command": "uv run --with pytest pytest tests/ -v",
"verifier_result": "approved"
},
"user_value": "Judges and external developers can now consume a complete SQLEnv submission package with HF Spaces-compatible deployment artifacts, a polished README quickstart, a structured blog outline, and a Colab-ready GRPO training notebook.",
"demo": {
"path": "specs/F007-DEMO.md",
"generated_at": "2026-03-29T07:33:23Z",
"mode": "infra_release",
"status": "partial",
"requires_user_verification": true,
"verification_surfaces": [
"local_manifest_validation",
"local_docker_build",
"external_registry_auth",
"hf_space_push",
"browser_episode_flow",
"colab_notebook_run"
],
"evidence_refs": [
"specs/F007-VERIFICATION_SPEC.md",
"specs/F007-DEMO.md"
],
"note": "Authenticated local build and HF push now both succeed for hjerpe/sql_env; browser episode flow and Colab run remain user-verified surfaces."
}
},
{
"id": "F008",
"name": "Synthetic Database Generation",
"description": "Generate variant SQLite databases with same schema but different data for metamorphic testing. Implements 3 MVP mutations: irrelevant row injection, ID remapping, and duplicate bridge rows. Validates that gold SQL produces correct (potentially different) answers on variant DBs. Enables robustness testing against accidental correctness.",
"complexity": "standard",
"verification_mode": "mvp",
"status": "complete",
"priority": 8,
"dependencies": [
"F004"
],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "user_interview",
"notes": "Metamorphic testing from docs_draft/reward-research_gpt-5-2.md and docs_draft/SQLEnv_Concept_v1.md Section 6.2. Originally scoped as post-MVP but user requested as separate feature."
},
"user_interview": {
"conducted": "2026-03-24T10:30:00Z",
"skipped": false,
"skip_reason": null,
"value": {
"question": "What will users be able to do that they couldn't before?",
"response": "Verify that agent-produced SQL is semantically correct, not just accidentally correct on one dataset. Catches missing JOINs, wrong filters, and hard-coded values."
},
"experience": {
"question": "Walk me through using this. What would delight you? What would frustrate you?",
"delights": [
"Script generates 1-2 variant DBs per question automatically",
"Gold SQL still produces valid answers on variant DBs",
"Catches real bugs: missing DISTINCT, wrong join direction"
],
"frustrations": [
"Mutations break gold SQL (variant DB is invalid)",
"Too many false positives from mutations",
"Expensive to run during training"
]
},
"maturity": {
"question": "Is this exploratory, MVP, or production?",
"response": "exploratory",
"rationale": "Post-submission stretch goal. Only 3 mutations for MVP, evaluate impact before expanding."
}
},
"progress": {
"implementation_steps": {
"total": 8,
"completed": 8
},
"verification_tests": {
"total": 61,
"passed": 60
}
},
"specs": {
"implementation": "specs/F008-IMPLEMENTATION_SPEC.md",
"verification": "specs/F008-VERIFICATION_SPEC.md"
},
"timestamps": {
"planned": "2026-03-27T12:00:00Z",
"verification_planned": "2026-03-27T12:00:00Z",
"started": "2026-03-27T22:16:14Z",
"completed": "2026-03-27T22:57:19Z"
},
"demo": {
"path": "specs/F008-DEMO.md",
"generated_at": "2026-03-27T22:55:58Z",
"mode": "local_cli",
"status": "generated",
"requires_user_verification": false,
"verification_surfaces": [
"local_cli",
"local_tests"
],
"evidence_refs": [
"specs/F008-VERIFICATION_SPEC.md",
"specs/F008-IMPLEMENTATION_SPEC.md"
],
"note": "Demo includes live CLI usage, edge/error cases, and supplementary local test run output."
},
"verification_evidence": {
"mode": "mvp",
"tests_run": 61,
"tests_passed": 60,
"timestamp": "2026-03-27T22:57:19Z",
"command": "uv run pytest tests/ -v",
"verifier_result": "approved"
},
"user_value": "Users can now generate synthetic Spider DB variants with schema-preserving data mutations and gold-SQL validation, enabling metamorphic checks that expose brittle SQL patterns like hard-coded IDs and missing DISTINCT."
},
{
"id": "F009",
"name": "Oracle Policy",
"description": "Cheater/oracle policy that knows the gold SQL and answer. Plays optimal episodes: DESCRIBE relevant tables, execute gold SQL, submit answer. Validates reward ceiling (~1.3 expected) and provides upper-bound baseline for blog comparison (oracle vs trained vs random).",
"complexity": "simple",
"verification_mode": "mvp",
"status": "complete",
"priority": 9,
"dependencies": [
"F001",
"F002"
],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "user_interview",
"notes": "From project plan: 'Cheater Policy — quick end-to-end test for maximum reward on environment'. Project brief Phase 2 done-when: 'A hardcoded cheat policy that knows the answer can achieve 100% success rate.'"
},
"user_interview": {
"conducted": "2026-03-28T12:00:00Z",
"skipped": false,
"skip_reason": null,
"value": {
"question": "What will users be able to do that they couldn't before?",
"response": "Validate that the environment reward ceiling works as designed. Oracle achieves ~100% success rate and ~1.3 total reward, confirming dense rewards stack correctly with terminal correctness. Provides upper-bound baseline for trained model comparison."
},
"experience": {
"question": "Walk me through using this. What would delight you? What would frustrate you?",
"delights": [
"Oracle runs 100 episodes and reports near-perfect success rate",
"Reward breakdown shows terminal + exploration adding up correctly",
"Can compare oracle vs random vs trained in one table"
],
"frustrations": [
"Oracle fails on questions where gold SQL is valid but gold answer extraction differs",
"Oracle reward lower than expected, indicating reward bug"
]
},
"maturity": {
"question": "Is this exploratory, MVP, or production?",
"response": "mvp",
"rationale": "Validation tool for environment quality. Straightforward implementation — knows gold answer, submits it."
}
},
"progress": {
"implementation_steps": {
"total": 2,
"completed": 2
},
"verification_tests": {
"total": 25,
"passed": 25
}
},
"specs": {
"implementation": "specs/F009-IMPLEMENTATION_SPEC.md",
"verification": "specs/F009-VERIFICATION_SPEC.md"
},
"timestamps": {
"planned": "2026-03-28T12:00:00Z",
"verification_planned": "2026-03-28T12:00:00Z",
"started": "2026-03-28T17:06:05Z",
"completed": "2026-03-28T17:14:17Z"
},
"demo": {
"path": "specs/F009-DEMO.md",
"generated_at": "2026-03-28T17:17:27Z",
"mode": "artifact_build",
"status": "partial",
"requires_user_verification": true,
"verification_surfaces": [
"local_unit_tests",
"package_export",
"integration_e2e_followup"
],
"evidence_refs": [
"specs/F009-VERIFICATION_SPEC.md",
"specs/F009-IMPLEMENTATION_SPEC.md"
],
"note": "Strongest local proof is targeted/local pytest evidence; verification-spec integration/E2E file paths are not present in this workspace."
},
"verification_evidence": {
"mode": "mvp",
"tests_run": 40,
"tests_passed": 40,
"timestamp": "2026-03-28T17:14:17Z",
"command": "uv run --with pytest pytest tests/unit/test_oracle_policy.py tests/test_evaluation.py -v",
"verifier_result": "approved"
},
"user_value": "Users can now import and run OraclePolicy from sql_env.evaluation to produce a deterministic upper-bound baseline in evaluate(), validating reward-ceiling behavior and enabling direct oracle-vs-random-vs-trained comparisons."
},
{
"id": "F010",
"name": "TRL Environment Adapter",
"description": "Wrap SQLEnv as a TRL-compatible environment_factory class. Public methods (describe, sample, query, answer) become LLM-callable tools automatically. Includes reset(**kwargs) for episode initialization, reward accumulation for reward_func, and concurrent session support (max_concurrent_envs). Replaces need for custom rollout_func in F006.",
"complexity": "standard",
"verification_mode": "mvp",
"status": "complete",
"priority": 10,
"dependencies": [
"F001",
"F003"
],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "user_interview",
"notes": "Derived from TRL OpenEnv docs (https://huggingface.co/docs/trl/main/openenv). environment_factory is the recommended pattern over rollout_func."
},
"user_interview": {
"conducted": "2026-03-28T12:00:00Z",
"skipped": false,
"skip_reason": null,
"value": {
"question": "What will users be able to do that they couldn't before?",
"response": "Train any HuggingFace model against SQLEnv using standard TRL GRPOTrainer with environment_factory. No custom rollout code needed — TRL handles generation, tool parsing, and multi-turn loop automatically."
},
"experience": {
"question": "Walk me through using this. What would delight you? What would frustrate you?",
"delights": [
"Pass SQLEnvTRL as environment_factory to GRPOTrainer and it works",
"Tool methods have typed docstrings so TRL auto-discovers them",
"Concurrent sessions handle parallel rollouts without contention"
],
"frustrations": [
"Tool method signatures don't match what TRL expects",
"Environment state leaks between episodes",
"Concurrent sessions cause SQLite locking errors"
]
},
"maturity": {
"question": "Is this exploratory, MVP, or production?",
"response": "mvp",
"rationale": "Must work for competition demo. Concurrent sessions can start with modest parallelism (4-8)."
}
},
"progress": {
"implementation_steps": {
"total": 5,
"completed": 5
},
"verification_tests": {
"total": 48,
"passed": 48
}
},
"specs": {
"implementation": "specs/F010-IMPLEMENTATION_SPEC.md",
"verification": "specs/F010-VERIFICATION_SPEC.md"
},
"timestamps": {
"planned": "2026-03-28T12:00:00Z",
"verification_planned": "2026-03-28T12:00:00Z",
"started": "2026-03-28T17:05:54Z",
"completed": "2026-03-28T17:29:10Z"
},
"verification_evidence": {
"mode": "mvp",
"tests_run": 288,
"tests_passed": 287,
"timestamp": "2026-03-28T17:29:10Z",
"command": "uv run --with pytest pytest tests/ -v",
"verifier_result": "approved"
},
"demo": {
"path": "specs/F010-DEMO.md",
"generated_at": "2026-03-28T17:31:44Z",
"mode": "artifact_build",
"status": "partial",
"requires_user_verification": true,
"verification_surfaces": [
"local_pytest_verification",
"real_trl_training_run",
"concurrent_rollout_runtime"
],
"evidence_refs": [
"specs/F010-VERIFICATION_SPEC.md",
"specs/F010-DEMO.md"
],
"note": "Strongest local proof is targeted test execution; full confidence still requires user-run TRL training and concurrency validation."
},
"user_value": "Users can now train TRL/GRPO policies against SQLEnv via native environment_factory tool-calling with SQLEnvTRL, without maintaining a custom rollout loop."
},
{
"id": "F011",
"name": "Prompting Baseline Notebook",
"description": "New notebook (notebooks/showcase_prompting.ipynb) demonstrating base model performance on SQL tasks using only prompt engineering — no training. Serves as a baseline comparison for the GRPO-trained model. Sections: (1) Zero-shot with tool definitions, (2) Few-shot in-context learning with example trajectories from SFT data, (3) Chain-of-thought prompting, (4) Evaluation on held-out eval set across all techniques, (5) Accuracy comparison table + bar chart, (6) Optional side-by-side with trained model checkpoint.",
"complexity": "standard",
"verification_mode": "mvp",
"status": "complete",
"priority": 11,
"dependencies": [
"F006",
"F010"
],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "user_interview",
"notes": "User wants to demonstrate that training adds value over pure prompting. Key insight: this notebook makes the GRPO training story more compelling by showing the gap."
},
"user_interview": {
"conducted": "2026-04-02T08:27:55+00:00",
"skipped": false,
"skip_reason": null,
"value": {
"question": "What will users be able to do that they could not before?",
"response": "See exactly how much the base model can do with prompting alone, making the GRPO training improvement measurable and the notebook more convincing as a demo."
},
"experience": {
"question": "Walk me through using this. What would delight you? What would frustrate you?",
"delights": [
"Clear accuracy comparison table across techniques",
"Same eval set used for all methods (fair comparison)",
"Can load a trained checkpoint for side-by-side",
"Runs on Colab without training (fast demo)"
],
"frustrations": [
"Eval taking too long (should be lightweight)",
"Unclear what prompting technique is being used",
"No visual comparison (just numbers)"
]
},
"maturity": {
"question": "Is this exploratory, MVP, or production?",
"response": "mvp",
"rationale": "Demonstrates the value proposition of training. Can iterate on techniques later."
}
},
"progress": {
"implementation_steps": {
"total": 7,
"completed": 7
},
"verification_tests": {
"total": 36,
"passed": 17
}
},
"specs": {
"implementation": "specs/F011-IMPLEMENTATION_SPEC.md",
"verification": "specs/F011-VERIFICATION_SPEC.md"
},
"timestamps": {
"planned": "2026-04-06T08:27:07.093218+00:00",
"verification_planned": "2026-04-06T08:27:07.093218+00:00",
"started": "2026-04-06T19:09:21Z",
"completed": "2026-04-07T05:10:40Z"
},
"verification_evidence": {
"mode": "mvp",
"tests_run": 17,
"tests_passed": 17,
"timestamp": "2026-04-07T05:10:40Z",
"command": "uv run pytest tests/test_evaluation.py -v",
"verifier_result": "approved"
},
"user_value": "Users can now run one notebook that fairly compares zero-shot/1-shot/3-shot prompting against GRPO no-think and GRPO thinking checkpoints on the same eval subset, with both tabular metrics and a visual accuracy bar chart.",
"demo": {
"path": "specs/F011-DEMO.md",
"generated_at": "2026-04-07T05:12:46Z",
"mode": "artifact_build",
"status": "partial",
"requires_user_verification": true,
"verification_surfaces": [
"local_notebook_execution",
"local_visual_artifact_export",
"interactive_notebook_run",
"hf_checkpoint_access"
],
"evidence_refs": [
"specs/F011-VERIFICATION_SPEC.md",
"specs/F011-DEMO.md"
],
"note": "Notebook execution was attempted locally but failed in this environment; static visual artifact export succeeded, and full interactive chart/table validation remains a user-run check."
}
},
{
"id": "F012",
"name": "Enable Thinking Mode",
"description": "Remove /no_think suppression and enable_thinking=False so Qwen3 can reason during GRPO rollouts. Model currently generates empty <think> blocks and cannot reason about SQL errors (repeats same failing query verbatim). Enables pretrained reasoning capability via reward signal — SFT data unchanged.",
"complexity": "simple",
"verification_mode": "mvp",
"status": "not_started",
"priority": 12,
"dependencies": [],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "none",
"notes": "Discovered during Run 6 analysis: model repeats failing queries because it cannot reason about errors"
},
"user_interview": {
"conducted": "2026-04-04T05:32:07+00:00",
"skipped": true,
"skip_reason": "Simple config change — 3 files, clear pattern",
"value": null,
"experience": null,
"maturity": null
},
"progress": {
"implementation_steps": {
"total": 0,
"completed": 0
},
"verification_tests": {
"total": 0,
"passed": 0
}
},
"specs": {
"implementation": null,
"verification": null
},
"inline_spec": {
"files": [
"scripts/generate_sft_data.py",
"notebooks/train_grpo.ipynb",
"training/notebook_pipeline.py"
],
"description": "Remove /no_think from SYSTEM_PROMPT in SFT and GRPO. Change enable_thinking: False to True in notebook_pipeline.py chat_template_kwargs. Regenerate SFT data.",
"verification": "Run training on Colab — verify model produces non-empty <think> blocks and changes SQL after errors"
},
"timestamps": {
"planned": "2026-04-04T05:32:07+00:00",
"verification_planned": null,
"started": null,
"completed": null
},
"verification_evidence": null,
"user_value": null
},
{
"id": "F013",
"name": "Error-Recovery SFT Trajectories",
"description": "Add 15-20 SFT trajectories to generate_sft_data.py showing error recovery: model queries with wrong column/table → gets SQL error → re-examines schema via describe/sample → writes corrected query → submits correct answer. Teaches the base policy to recover from mistakes before GRPO, so KL-anchored exploration includes error recovery as a learned pattern.",
"complexity": "standard",
"verification_mode": "standard",
"status": "complete",
"priority": 13,
"dependencies": [],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "none",
"notes": "Run 7 analysis: error loops are the #1 reward killer. Model repeats same failing query 3-8x because SFT only shows happy paths. No error-recovery pattern in base policy."
},
"user_interview": {
"conducted": "2026-04-04T11:35:48+00:00",
"skipped": true,
"skip_reason": "Pattern clear from Run 7 rollout analysis — model needs error-recovery examples in SFT data",
"value": null,
"experience": null,
"maturity": null
},
"progress": {
"implementation_steps": {
"total": 4,
"completed": 4
},
"verification_tests": {
"total": 55,
"passed": 55
}
},
"specs": {
"implementation": "specs/F013-IMPLEMENTATION_SPEC.md",
"verification": "specs/F013-VERIFICATION_SPEC.md"
},
"timestamps": {
"planned": "2026-04-04T11:50:45+00:00",
"verification_planned": "2026-04-04T11:50:45+00:00",
"started": "2026-04-04T14:10:09Z",
"completed": "2026-04-04T18:20:00Z"
},
"verification_evidence": {
"mode": "standard",
"tests_run": 2,
"tests_passed": 2,
"timestamp": "2026-04-04T18:20:00Z",
"command": "uv run pytest tests/unit/test_sft_terminal_message.py -v && uv run python scripts/generate_sft_data.py"
},
"user_value": null
},
{
"id": "F014",
"name": "Stop-After-Correct SFT Trajectories",
"description": "Add 5-10 SFT trajectories where the model answers correctly and the conversation ends cleanly — no post-episode tool calls. Currently all SFT examples end with the tool response 'Answer submitted: correct.' but the model still generates extra calls afterward during GRPO. Explicitly training on clean episode endings teaches the stop signal.",
"complexity": "simple",
"verification_mode": "mvp",
"status": "complete",
"priority": 14,
"dependencies": [
"F013"
],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "none",
"notes": "Run 7: model makes 1-3 extra calls after correct answer despite -0.3 post-episode penalty. SFT ending is ambiguous — model sees tool response but has no 'done generating' signal."
},
"user_interview": {
"conducted": "2026-04-04T11:35:48+00:00",
"skipped": true,
"skip_reason": "Simple extension of generate_sft_data.py — add final assistant turn with no tool call",
"value": null,
"experience": null,
"maturity": null
},
"progress": {
"implementation_steps": {
"total": 1,
"completed": 1
},
"verification_tests": {
"total": 21,
"passed": 2
}
},
"specs": {
"implementation": "specs/F014-IMPLEMENTATION_SPEC.md",
"verification": "specs/F014-VERIFICATION_SPEC.md"
},
"inline_spec": {
"files": [
"scripts/generate_sft_data.py"
],
"description": "After the final 'Answer submitted: correct.' tool response, do NOT append another assistant turn. The SFT example ends at the tool response. TRL's assistant_only_loss means the model only trains on assistant turns, so ending after the final tool response teaches the model that no further generation is needed. Alternatively, add a short assistant turn with just a stop token or empty content.",
"verification": "Inspect rendered SFT data — confirm examples end after correct answer tool response. Run GRPO training and check post-episode call count decreases."
},
"timestamps": {
"planned": "2026-04-04T11:48:20+00:00",
"verification_planned": "2026-04-04T11:48:20+00:00",
"started": "2026-04-04T14:17:03Z",
"completed": "2026-04-04T14:17:03Z"
},
"verification_evidence": {
"mode": "mvp",
"tests_run": 2,
"tests_passed": 2,
"timestamp": "2026-04-04T14:17:03Z",
"command": "uv run pytest tests/unit/test_sft_terminal_message.py -v",
"verifier_result": "approved"
},
"demo": {
"path": "specs/F014-DEMO.md",
"generated_at": "2026-04-04T14:21:55Z",
"mode": "artifact_build",
"status": "generated",
"requires_user_verification": true,
"verification_surfaces": [
"local_sft_generation",
"artifact_inspection",
"training_runtime_behavior"
],
"evidence_refs": [
"specs/F014-VERIFICATION_SPEC.md",
"specs/F014-DEMO.md"
],
"note": "Local SFT artifact and terminal-message shape are verified; reduction in post-answer calls must be confirmed in GRPO runtime."
},
"user_value": "SFT trajectories now end with an explicit terminal assistant message after correct answer confirmation, teaching a clear stop pattern that helps reduce extra post-answer tool calls during GRPO."
},
{
"id": "F015",
"name": "Error-Repetition Penalty",
"description": "In trl_adapter.py, track recent tool calls (function name + arguments) in a short window. When the model makes an exact repeat of any recent call, apply -0.2 penalty. Uses trajectory-level reward aggregation — safe for GRPO (no Markov violation because GRPO uses Monte Carlo returns, not Bellman bootstrapping, and the model's context window already contains full history as augmented state).",
"complexity": "simple",
"verification_mode": "standard",
"status": "complete",
"priority": 15,
"dependencies": [],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "none",
"notes": "Run 7: model repeats exact same failing query 3-8 times. -0.2 per repeat is moderate enough to avoid the repeat-avoidance trap (preferring novel-but-wrong over correct retry). Exact-match comparison (function+args string equality) is simple and sufficient."
},
"user_interview": {
"conducted": "2026-04-04T11:35:48+00:00",
"skipped": true,
"skip_reason": "Small code change in trl_adapter.py — add _recent_calls tracking and repeat penalty",
"value": null,
"experience": null,
"maturity": null
},
"progress": {
"implementation_steps": {
"total": 2,
"completed": 2
},
"verification_tests": {
"total": 55,
"passed": 55
}
},
"specs": {
"implementation": "specs/F015-IMPLEMENTATION_SPEC.md",
"verification": "specs/F015-VERIFICATION_SPEC.md"
},
"inline_spec": {
"files": [
"training/trl_adapter.py",
"tests/unit/test_trl_adapter.py",
"training/rollout.py",
"training/notebook_pipeline.py",
"notebooks/train_grpo.ipynb"
],
"description": "Add self._recent_calls: collections.deque[tuple[str, str]] with maxlen=3 and self._repeat_count: int in __init__. In each tool method (describe, query, sample, answer), before executing: build call_key = (method_name, arg_value). If call_key appears in self._recent_calls, apply _REPEAT_PENALTY = -0.2 and increment self._repeat_count. Always append call_key after execution. Reset self._recent_calls and self._repeat_count in reset().",
"verification": "Unit test: call query('SELECT 1') twice in a row, verify reward includes -0.2 repeat penalty. Call query('SELECT 1') then query('SELECT 2'), verify no penalty."
},
"timestamps": {
"planned": "2026-04-04T11:47:59+00:00",
"verification_planned": "2026-04-04T11:47:59+00:00",
"started": "2026-04-05T05:23:09Z",
"completed": "2026-04-05T05:43:04Z"
},
"verification_evidence": {
"mode": "standard",
"tests_run": 55,
"tests_passed": 55,
"timestamp": "2026-04-05T05:43:04Z",
"command": "uv run pytest tests/unit/test_trl_adapter.py -v && uv run pytest tests/unit/test_trl_adapter.py -v -k \"repeat or last_call\" && uv run pytest tests/e2e/test_training_e2e.py -v",
"verifier_result": "approved"
},
"demo": {
"path": "specs/F015-DEMO.md",
"generated_at": "2026-04-05T05:50:52Z",
"mode": "artifact_build",
"status": "generated",
"requires_user_verification": true,
"verification_surfaces": [
"local_pytest_verification",
"training_runtime_behavior"
],
"evidence_refs": [
"specs/F015-VERIFICATION_SPEC.md",
"specs/F015-DEMO.md"
],
"note": "Strongest local proof is targeted/full pytest and training e2e smoke; reduced repeat loops in live GRPO trajectories still requires user runtime confirmation."
},
"user_value": "Agents now receive a deterministic repeat-call penalty for reused tool calls within a short recent-call window (including alternating reuse), reducing degenerate GRPO loops while preserving non-repeated exploration behavior."
},
{
"id": "F016",
"name": "Pre-Publication Code Quality Sweep",
"description": "Refactor, lint fixes, and code smell cleanup before blog post publication. Runs ruff --fix, removes dead code, fixes line lengths, and addresses unused variables. Staff review of core modules (reward, verifier, trl_adapter, sql_environment) for correctness and clarity.",
"complexity": "simple",
"verification_mode": "mvp",
"status": "not_started",
"priority": 1,
"dependencies": [],
"docs": {
"discovery_json": null,
"discovery_md": null,
"design_doc": null,
"delivery_spec": null
},
"taste": {
"source": "user_interview",
"notes": "Blog deadline tomorrow — codebase must be presentable for open-source judges"
},
"user_interview": {
"conducted": "2026-04-11T15:55:16Z",
"skipped": false,
"skip_reason": null,
"value": {
"question": "What will users be able to do that they couldn't before?",
"response": "Judges and readers reviewing the GitHub repo will see clean, well-linted code without obvious smells. The codebase matches the quality story told in the blog post."
},
"experience": {
"question": "Walk me through using this. What would delight you? What would frustrate you?",
"delights": [
"Zero ruff errors on clone",
"No dead imports or unused variables",
"Core modules pass a staff-level review"
],
"frustrations": [
"Visible linting errors in the repo judges clone",
"Commented-out code or debug prints left in",
"Inconsistent formatting between files"
]
},
"maturity": {
"question": "Is this exploratory, MVP, or production?",
"response": "mvp",
"rationale": "Ship-blocking cleanup, not a deep refactor. Fix what's visible, don't reorganize."
}
},
"progress": {
"implementation_steps": {
"total": 4,
"completed": 0
},
"verification_tests": {
"total": 2,
"passed": 0
}
},
"specs": {
"implementation": null,
"verification": null
},
"inline_spec": {
"files": [
"server/sql_environment.py",
"server/verifier.py",
"server/reward.py",
"training/trl_adapter.py",
"training/config.py",
"training/notebook_pipeline.py",
"training/data_loading.py",
"evaluation/policies.py",
"evaluation/runner.py",
"scripts/generate_sft_data.py",
"tests/"
],
"description": "Four steps: (1) ruff check --fix + ruff format, (2) manual fix remaining lint errors (line length, unused vars, dead imports), (3) spec-staff-review on core modules, (4) address review findings. Inline verification: ruff check passes with 0 errors, all existing tests pass.",
"verification": "ruff check . returns 0 errors; uv run python -m pytest tests/ passes; staff review findings addressed or documented"
},
"timestamps": {
"planned": "2026-04-11T15:55:16Z",
"verification_planned": null,
"started": null,
"completed": null
},
"verification_evidence": null,
"user_value": null
}
]
}