Priyansh Saxena committed on
Commit
1435892
·
1 Parent(s): 5b04645

feat: expand scenarios and investigation actions

Browse files
README.md CHANGED
@@ -56,9 +56,11 @@ with env.sync() as client:
56
  | Task | Difficulty | Description |
57
  |------|-----------|-------------|
58
  | `easy` | ⭐ | Single-file bug — missing `zero_grad`, wrong loss |
59
- | `medium` | ⭐⭐ | Multi-file root cause — data leakage, scheduler mismatch |
60
  | `hard` | ⭐⭐⭐ | Silent failure — memory leak, AMP overflow, red herrings |
61
 
 
 
62
  ## Reward Structure
63
 
64
  - **Hypothesis delta** (60%) — reward for improving your bug hypothesis each step
@@ -67,6 +69,33 @@ with env.sync() as client:
67
 
68
  Scores range from `0.0` to `1.0`. Partial credit for correct bug category on hard tasks.
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  ## Environment State
71
 
72
  Each episode provides a synthetic PyTorch repo with:
 
56
  | Task | Difficulty | Description |
57
  |------|-----------|-------------|
58
  | `easy` | ⭐ | Single-file bug — missing `zero_grad`, wrong loss |
59
+ | `medium` | ⭐⭐ | Multi-file root cause — data leakage, learning-rate misconfig |
60
  | `hard` | ⭐⭐⭐ | Silent failure — memory leak, AMP overflow, red herrings |
61
 
62
+ Each difficulty draws from multiple bug templates, so repeated runs do not recycle the same exact failure.
63
+
64
  ## Reward Structure
65
 
66
  - **Hypothesis delta** (60%) — reward for improving your bug hypothesis each step
 
69
 
70
  Scores range from `0.0` to `1.0`. Partial credit for correct bug category on hard tasks.
71
 
72
+ ## Investigation Actions
73
+
74
+ - `reveal_file`: reveal a file from the synthetic repo
75
+ - `extend_loss_curve`: reveal more loss-curve points
76
+ - `extend_gpu_profile`: reveal more GPU profile points
77
+ - `reveal_log_chunk`: append additional training log lines
78
+ - `run_diagnostic`: expose a diagnostic summary report
79
+
80
+ ## Reproducibility
81
+
82
+ Use `SEED` to make scenario selection and artifacts deterministic across runs:
83
+
84
+ ```bash
85
+ set SEED=42
86
+ python inference.py
87
+ ```
88
+
89
+ ## Baseline Scores
90
+
91
+ Run `inference.py` with a fixed `SEED` to record your baseline scores. The script prints per-task `[END]` lines with the final rewards.
92
+
93
+ Example template (fill after running):
94
+
95
+ | Model | Seed | Easy | Medium | Hard |
96
+ |------|------|------|--------|------|
97
+ | gpt-3.5-turbo | 42 | 0.xx | 0.xx | 0.xx |
98
+
99
  ## Environment State
100
 
101
  Each episode provides a synthetic PyTorch repo with:
inference.py CHANGED
@@ -15,6 +15,16 @@ TASKS = os.environ.get("TASKS", "easy,medium,hard")
15
  MAX_STEPS = int(os.environ.get("MAX_STEPS", "5"))
16
  SUCCESS_SCORE_THRESHOLD = float(os.environ.get("SUCCESS_SCORE_THRESHOLD", "0.7"))
17
  MAX_TOTAL_REWARD = float(os.environ.get("MAX_TOTAL_REWARD", "1.0"))
 
 
 
 
 
 
 
 
 
 
18
 
19
 
20
  def _sanitize_field(value: object) -> str:
@@ -57,7 +67,7 @@ You are debugging a PyTorch training job. Respond ONLY with valid JSON matching
57
  }}
58
 
59
  Valid action types: reveal_file, extend_loss_curve, extend_gpu_profile, reveal_log_chunk, run_diagnostic
60
- Valid bug types: missing_zero_grad, data_leakage, memory_leak, learning_rate_too_high, gradient_explosion
61
 
62
  Observation:
63
  {json.dumps(observation)[:8000]}
@@ -76,12 +86,16 @@ async def _run_task(task: str, client: OpenAI) -> None:
76
  rewards: List[float] = []
77
  history: List[str] = []
78
  steps_taken = 0
 
79
 
80
  log_start(task=task, env="pytorch-debug-env", model=MODEL_NAME)
81
 
82
  try:
83
  async with httpx.AsyncClient(timeout=60.0) as session:
84
- reset_resp = await session.post(f"{ENV_URL}/reset", params={"task_id": task})
 
 
 
85
  reset_resp.raise_for_status()
86
  result = reset_resp.json()
87
 
 
15
  MAX_STEPS = int(os.environ.get("MAX_STEPS", "5"))
16
  SUCCESS_SCORE_THRESHOLD = float(os.environ.get("SUCCESS_SCORE_THRESHOLD", "0.7"))
17
  MAX_TOTAL_REWARD = float(os.environ.get("MAX_TOTAL_REWARD", "1.0"))
18
+ SEED = os.environ.get("SEED")
19
+
20
+
21
+ def _parse_seed(value: str | None) -> int | None:
22
+ if value is None:
23
+ return None
24
+ try:
25
+ return int(value)
26
+ except ValueError:
27
+ return None
28
 
29
 
30
  def _sanitize_field(value: object) -> str:
 
67
  }}
68
 
69
  Valid action types: reveal_file, extend_loss_curve, extend_gpu_profile, reveal_log_chunk, run_diagnostic
70
+ Valid bug types: missing_zero_grad, data_leakage, memory_leak, learning_rate_too_high, gradient_explosion, wrong_loss_function, amp_overflow
71
 
72
  Observation:
73
  {json.dumps(observation)[:8000]}
 
86
  rewards: List[float] = []
87
  history: List[str] = []
88
  steps_taken = 0
89
+ seed_value = _parse_seed(SEED)
90
 
91
  log_start(task=task, env="pytorch-debug-env", model=MODEL_NAME)
92
 
93
  try:
94
  async with httpx.AsyncClient(timeout=60.0) as session:
95
+ reset_params = {"task_id": task}
96
+ if seed_value is not None:
97
+ reset_params["seed"] = seed_value
98
+ reset_resp = await session.post(f"{ENV_URL}/reset", params=reset_params)
99
  reset_resp.raise_for_status()
100
  result = reset_resp.json()
101
 
src/pytorch_debug_env/bug_library.py CHANGED
@@ -43,15 +43,66 @@ def dummy_artifact_generator(artifact_type: str, rng):
43
  {"step": int(i), "train_loss": float(base[i] + oscillation[i])}
44
  for i in range(100)
45
  ]
46
- elif artifact_type == "gpu_profile":
47
  t = np.arange(100)
48
  allocated = 2048 + 2.4 * t
49
  return [
50
  {"step": int(i), "allocated_mb": float(allocated[i])}
51
  for i in range(100)
52
  ]
53
- elif artifact_type == "training_log":
54
  return "Epoch 1, Step 0: loss 2.45\nEpoch 1, Step 1: loss 2.43\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  return []
56
 
57
  def mutate_missing_zero_grad(repo_files, rng):
@@ -97,6 +148,52 @@ class ImageDataset(Dataset):
97
  """
98
  return repo_files
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  BUG_TEMPLATES = [
101
  BugTemplate(
102
  bug_type="missing_zero_grad",
@@ -111,6 +208,19 @@ BUG_TEMPLATES = [
111
  artifact_generator=dummy_artifact_generator,
112
  repo_mutator=mutate_missing_zero_grad,
113
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  BugTemplate(
115
  bug_type="data_leakage",
116
  category="data",
@@ -124,6 +234,19 @@ BUG_TEMPLATES = [
124
  artifact_generator=dummy_artifact_generator,
125
  repo_mutator=mutate_data_leakage,
126
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  BugTemplate(
128
  bug_type="memory_leak",
129
  category="resource",
@@ -136,5 +259,18 @@ BUG_TEMPLATES = [
136
  description="Memory leak",
137
  artifact_generator=dummy_artifact_generator,
138
  repo_mutator=mutate_memory_leak,
139
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  ]
 
43
  {"step": int(i), "train_loss": float(base[i] + oscillation[i])}
44
  for i in range(100)
45
  ]
46
+ if artifact_type == "gpu_profile":
47
  t = np.arange(100)
48
  allocated = 2048 + 2.4 * t
49
  return [
50
  {"step": int(i), "allocated_mb": float(allocated[i])}
51
  for i in range(100)
52
  ]
53
+ if artifact_type == "training_log":
54
  return "Epoch 1, Step 0: loss 2.45\nEpoch 1, Step 1: loss 2.43\n"
55
+ if artifact_type == "diagnostic_report":
56
+ return "No critical diagnostics found. Review optimizer and data pipeline."
57
+ return []
58
+
59
+
60
+ def artifact_generator_wrong_loss(artifact_type: str, rng):
61
+ if artifact_type == "loss_curve":
62
+ t = np.arange(100)
63
+ base = 1.8 + 0.05 * np.sin(0.15 * t)
64
+ return [{"step": int(i), "train_loss": float(base[i])} for i in range(100)]
65
+ if artifact_type == "gpu_profile":
66
+ t = np.arange(100)
67
+ allocated = 1900 + 1.8 * t
68
+ return [{"step": int(i), "allocated_mb": float(allocated[i])} for i in range(100)]
69
+ if artifact_type == "training_log":
70
+ return "Epoch 1: loss 1.82, acc 0.11\nEpoch 2: loss 1.80, acc 0.12\n"
71
+ if artifact_type == "diagnostic_report":
72
+ return "Loss plateaus early while accuracy stays near chance. Check loss function."
73
+ return []
74
+
75
+
76
+ def artifact_generator_lr_high(artifact_type: str, rng):
77
+ if artifact_type == "loss_curve":
78
+ t = np.arange(100)
79
+ base = 0.9 + 0.02 * (t ** 1.1)
80
+ return [{"step": int(i), "train_loss": float(base[i])} for i in range(100)]
81
+ if artifact_type == "gpu_profile":
82
+ t = np.arange(100)
83
+ allocated = 2100 + 2.0 * t
84
+ return [{"step": int(i), "allocated_mb": float(allocated[i])} for i in range(100)]
85
+ if artifact_type == "training_log":
86
+ return "Step 10: loss 3.20 (spike)\nStep 20: loss 5.10 (diverged)\n"
87
+ if artifact_type == "diagnostic_report":
88
+ return "Loss spikes suggest unstable updates. Consider lowering learning rate."
89
+ return []
90
+
91
+
92
+ def artifact_generator_amp_overflow(artifact_type: str, rng):
93
+ if artifact_type == "loss_curve":
94
+ t = np.arange(100)
95
+ base = 2.1 * np.exp(-0.008 * t) + 0.2
96
+ base[30:] = base[30:] + 0.6
97
+ return [{"step": int(i), "train_loss": float(base[i])} for i in range(100)]
98
+ if artifact_type == "gpu_profile":
99
+ t = np.arange(100)
100
+ allocated = 2300 + 3.2 * t
101
+ return [{"step": int(i), "allocated_mb": float(allocated[i])} for i in range(100)]
102
+ if artifact_type == "training_log":
103
+ return "AMP: overflow detected, skipping step\nAMP: scale reduced to 32768\n"
104
+ if artifact_type == "diagnostic_report":
105
+ return "AMP overflow warnings observed. Ensure GradScaler is used correctly."
106
  return []
107
 
108
  def mutate_missing_zero_grad(repo_files, rng):
 
148
  """
149
  return repo_files
150
 
151
+
152
+ def mutate_wrong_loss_function(repo_files, rng):
153
+ repo_files["train.py"] = """import torch
154
+ from model.architecture import Net
155
+
156
+ model = Net()
157
+ optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
158
+ criterion = torch.nn.MSELoss() # BUG: wrong loss for classification
159
+
160
+ for epoch in range(10):
161
+ for x, y in dataloader:
162
+ optimizer.zero_grad()
163
+ output = model(x)
164
+ loss = criterion(output, y)
165
+ loss.backward()
166
+ optimizer.step()
167
+ """
168
+ return repo_files
169
+
170
+
171
+ def mutate_learning_rate_too_high(repo_files, rng):
172
+ repo_files["config/training_config.yaml"] = """lr: 1.0
173
+ batch_size: 32
174
+ """
175
+ return repo_files
176
+
177
+
178
+ def mutate_amp_overflow(repo_files, rng):
179
+ repo_files["train.py"] = """import torch
180
+ from model.architecture import Net
181
+
182
+ model = Net().cuda()
183
+ optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
184
+
185
+ for epoch in range(10):
186
+ for x, y in dataloader:
187
+ optimizer.zero_grad()
188
+ with torch.cuda.amp.autocast():
189
+ output = model(x.cuda())
190
+ loss = torch.nn.functional.cross_entropy(output, y.cuda())
191
+ # BUG: missing GradScaler handling can cause overflows
192
+ loss.backward()
193
+ optimizer.step()
194
+ """
195
+ return repo_files
196
+
197
  BUG_TEMPLATES = [
198
  BugTemplate(
199
  bug_type="missing_zero_grad",
 
208
  artifact_generator=dummy_artifact_generator,
209
  repo_mutator=mutate_missing_zero_grad,
210
  ),
211
+ BugTemplate(
212
+ bug_type="wrong_loss_function",
213
+ category="optimization",
214
+ difficulty="easy",
215
+ primary_bug_file="train.py",
216
+ related_files=["config/training_config.yaml"],
217
+ red_herring_file="data/dataset.py",
218
+ fix_strategy="Use CrossEntropyLoss for classification logits",
219
+ line_range=[6, 12],
220
+ description="Wrong loss function",
221
+ artifact_generator=artifact_generator_wrong_loss,
222
+ repo_mutator=mutate_wrong_loss_function,
223
+ ),
224
  BugTemplate(
225
  bug_type="data_leakage",
226
  category="data",
 
234
  artifact_generator=dummy_artifact_generator,
235
  repo_mutator=mutate_data_leakage,
236
  ),
237
+ BugTemplate(
238
+ bug_type="learning_rate_too_high",
239
+ category="optimization",
240
+ difficulty="medium",
241
+ primary_bug_file="config/training_config.yaml",
242
+ related_files=["train.py"],
243
+ red_herring_file="model/attention.py",
244
+ fix_strategy="Reduce learning rate or use a scheduler",
245
+ line_range=[1, 1],
246
+ description="Learning rate too high",
247
+ artifact_generator=artifact_generator_lr_high,
248
+ repo_mutator=mutate_learning_rate_too_high,
249
+ ),
250
  BugTemplate(
251
  bug_type="memory_leak",
252
  category="resource",
 
259
  description="Memory leak",
260
  artifact_generator=dummy_artifact_generator,
261
  repo_mutator=mutate_memory_leak,
262
+ ),
263
+ BugTemplate(
264
+ bug_type="amp_overflow",
265
+ category="numerics",
266
+ difficulty="hard",
267
+ primary_bug_file="train.py",
268
+ related_files=["config/training_config.yaml"],
269
+ red_herring_file="model/architecture.py",
270
+ fix_strategy="Use GradScaler and scale updates for AMP",
271
+ line_range=[7, 13],
272
+ description="AMP overflow",
273
+ artifact_generator=artifact_generator_amp_overflow,
274
+ repo_mutator=mutate_amp_overflow,
275
+ ),
276
  ]
src/pytorch_debug_env/environment.py CHANGED
@@ -10,11 +10,14 @@ from .models import (
10
  PyTorchDebugObservation,
11
  PyTorchDebugState,
12
  )
13
- from .reward import compute_step_reward
14
  from .scenario_generator import ScenarioGenerator
15
  from .graders import grade_easy, grade_medium, grade_hard
16
 
17
  GRADER_MAP = {"easy": grade_easy, "medium": grade_medium, "hard": grade_hard}
 
 
 
18
 
19
 
20
  @dataclass
@@ -24,29 +27,40 @@ class RuntimeState:
24
  current_step: int = 0
25
  revealed_files: List[str] = field(default_factory=list)
26
  hypothesis_history: List[HypothesisRecord] = field(default_factory=list)
 
 
 
 
27
  done: bool = False
28
  final_score: float = 0.0
29
 
30
 
31
  class PyTorchDebugEnv:
32
  def __init__(self, generator: ScenarioGenerator, max_steps: int = 5):
 
33
  self.generator = generator
34
  self.runtime = RuntimeState(max_steps=max_steps)
35
 
36
- async def reset(self, task_id: str = "easy"):
37
- scenario = self.generator.generate(task_id)
 
38
  self.runtime = RuntimeState(
39
  scenario=scenario,
40
  max_steps=5 if task_id == "easy" else 6,
41
  current_step=0,
42
  revealed_files=["train.py", "config/training_config.yaml"],
43
  hypothesis_history=[],
 
 
 
 
44
  done=False,
45
  final_score=0.0,
46
  )
47
  return self._build_observation(last_feedback="Episode reset.")
48
 
49
  async def step(self, action: PyTorchDebugAction):
 
50
  if self.runtime.scenario is None:
51
  raise RuntimeError("Call /reset before /step")
52
 
@@ -58,10 +72,23 @@ class PyTorchDebugEnv:
58
  previous_quality = self.runtime.hypothesis_history[-1].quality if self.runtime.hypothesis_history else 0.0
59
 
60
  investigation_target = None
61
- if action.investigation_action and action.investigation_action.action == "reveal_file":
62
- investigation_target = action.investigation_action.target
63
- if investigation_target in scenario.repo_files and investigation_target not in self.runtime.revealed_files:
64
- self.runtime.revealed_files.append(investigation_target)
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  committed = action.final_diagnosis.model_dump() if action.commit_diagnosis and action.final_diagnosis else None
67
  reward, components = compute_step_reward(
@@ -73,6 +100,7 @@ class PyTorchDebugEnv:
73
  step_num=self.runtime.current_step,
74
  max_steps=self.runtime.max_steps,
75
  )
 
76
 
77
  if committed:
78
  grader = GRADER_MAP.get(scenario.task_id, grade_easy)
@@ -89,7 +117,7 @@ class PyTorchDebugEnv:
89
  conf_bonus = components["confirmation_bonus"]
90
 
91
  total = 0.60 * delta + 0.20 * inv_reward + 0.20 * diagnosis_reward + conf_bonus
92
- reward = round(min(max(total, 0.0), 1.0), 4)
93
 
94
  self.runtime.hypothesis_history.append(
95
  HypothesisRecord(
@@ -114,6 +142,7 @@ class PyTorchDebugEnv:
114
  }
115
 
116
  async def state(self):
 
117
  scenario = self.runtime.scenario
118
  if not scenario:
119
  return None
@@ -126,6 +155,7 @@ class PyTorchDebugEnv:
126
  remaining_files=[
127
  f for f in scenario.repo_files.keys() if f not in self.runtime.revealed_files
128
  ],
 
129
  done=self.runtime.done,
130
  final_score=self.runtime.final_score,
131
  )
@@ -135,10 +165,18 @@ class PyTorchDebugEnv:
135
  revealed = {k: v for k, v in scenario.repo_files.items() if k in self.runtime.revealed_files}
136
  available = [k for k in scenario.repo_files.keys() if k not in self.runtime.revealed_files]
137
 
138
- loss_window_size = min(len(scenario.loss_curve), 100 * (self.runtime.current_step + 1))
139
- gpu_window_size = min(len(scenario.gpu_profile), 100 * (self.runtime.current_step + 1))
 
 
 
 
 
 
140
  log_lines = scenario.training_log.splitlines()
141
- visible_log = "\n".join(log_lines[-min(len(log_lines), 10 * (self.runtime.current_step + 1)):])
 
 
142
 
143
  return PyTorchDebugObservation(
144
  scenario_id=scenario.scenario_id,
@@ -148,6 +186,7 @@ class PyTorchDebugEnv:
148
  loss_curve_window=scenario.loss_curve[:loss_window_size],
149
  gpu_profile_window=scenario.gpu_profile[:gpu_window_size],
150
  training_log_tail=visible_log,
 
151
  step_num=self.runtime.current_step,
152
  steps_remaining=max(0, self.runtime.max_steps - self.runtime.current_step),
153
  investigation_budget=max(0, self.runtime.max_steps - self.runtime.current_step),
 
10
  PyTorchDebugObservation,
11
  PyTorchDebugState,
12
  )
13
+ from .reward import clamp_score, compute_step_reward
14
  from .scenario_generator import ScenarioGenerator
15
  from .graders import grade_easy, grade_medium, grade_hard
16
 
17
  GRADER_MAP = {"easy": grade_easy, "medium": grade_medium, "hard": grade_hard}
18
+ LOSS_WINDOW_STEP = 25
19
+ GPU_WINDOW_STEP = 25
20
+ LOG_WINDOW_STEP = 10
21
 
22
 
23
  @dataclass
 
27
  current_step: int = 0
28
  revealed_files: List[str] = field(default_factory=list)
29
  hypothesis_history: List[HypothesisRecord] = field(default_factory=list)
30
+ loss_curve_bonus: int = 0
31
+ gpu_profile_bonus: int = 0
32
+ log_tail_bonus: int = 0
33
+ diagnostic_revealed: bool = False
34
  done: bool = False
35
  final_score: float = 0.0
36
 
37
 
38
  class PyTorchDebugEnv:
39
  def __init__(self, generator: ScenarioGenerator, max_steps: int = 5):
40
+ """Create a PyTorch debugging environment with a scenario generator."""
41
  self.generator = generator
42
  self.runtime = RuntimeState(max_steps=max_steps)
43
 
44
+ async def reset(self, task_id: str = "easy", seed: int | None = None):
45
+ """Start a new episode and return the initial observation."""
46
+ scenario = self.generator.generate(task_id, seed=seed)
47
  self.runtime = RuntimeState(
48
  scenario=scenario,
49
  max_steps=5 if task_id == "easy" else 6,
50
  current_step=0,
51
  revealed_files=["train.py", "config/training_config.yaml"],
52
  hypothesis_history=[],
53
+ loss_curve_bonus=0,
54
+ gpu_profile_bonus=0,
55
+ log_tail_bonus=0,
56
+ diagnostic_revealed=False,
57
  done=False,
58
  final_score=0.0,
59
  )
60
  return self._build_observation(last_feedback="Episode reset.")
61
 
62
  async def step(self, action: PyTorchDebugAction):
63
+ """Advance the environment by one step using the provided action."""
64
  if self.runtime.scenario is None:
65
  raise RuntimeError("Call /reset before /step")
66
 
 
72
  previous_quality = self.runtime.hypothesis_history[-1].quality if self.runtime.hypothesis_history else 0.0
73
 
74
  investigation_target = None
75
+ if action.investigation_action:
76
+ action_type = action.investigation_action.action
77
+ if action_type == "reveal_file":
78
+ investigation_target = action.investigation_action.target
79
+ if (
80
+ investigation_target in scenario.repo_files
81
+ and investigation_target not in self.runtime.revealed_files
82
+ ):
83
+ self.runtime.revealed_files.append(investigation_target)
84
+ elif action_type == "extend_loss_curve":
85
+ self.runtime.loss_curve_bonus += 1
86
+ elif action_type == "extend_gpu_profile":
87
+ self.runtime.gpu_profile_bonus += 1
88
+ elif action_type == "reveal_log_chunk":
89
+ self.runtime.log_tail_bonus += 1
90
+ elif action_type == "run_diagnostic":
91
+ self.runtime.diagnostic_revealed = True
92
 
93
  committed = action.final_diagnosis.model_dump() if action.commit_diagnosis and action.final_diagnosis else None
94
  reward, components = compute_step_reward(
 
100
  step_num=self.runtime.current_step,
101
  max_steps=self.runtime.max_steps,
102
  )
103
+ reward = clamp_score(reward)
104
 
105
  if committed:
106
  grader = GRADER_MAP.get(scenario.task_id, grade_easy)
 
117
  conf_bonus = components["confirmation_bonus"]
118
 
119
  total = 0.60 * delta + 0.20 * inv_reward + 0.20 * diagnosis_reward + conf_bonus
120
+ reward = round(clamp_score(min(max(total, 0.0), 1.0)), 4)
121
 
122
  self.runtime.hypothesis_history.append(
123
  HypothesisRecord(
 
142
  }
143
 
144
  async def state(self):
145
+ """Return the current episode state, or None if not started."""
146
  scenario = self.runtime.scenario
147
  if not scenario:
148
  return None
 
155
  remaining_files=[
156
  f for f in scenario.repo_files.keys() if f not in self.runtime.revealed_files
157
  ],
158
+ diagnostic_revealed=self.runtime.diagnostic_revealed,
159
  done=self.runtime.done,
160
  final_score=self.runtime.final_score,
161
  )
 
165
  revealed = {k: v for k, v in scenario.repo_files.items() if k in self.runtime.revealed_files}
166
  available = [k for k in scenario.repo_files.keys() if k not in self.runtime.revealed_files]
167
 
168
+ loss_window_size = min(
169
+ len(scenario.loss_curve),
170
+ LOSS_WINDOW_STEP * (self.runtime.current_step + 1 + self.runtime.loss_curve_bonus),
171
+ )
172
+ gpu_window_size = min(
173
+ len(scenario.gpu_profile),
174
+ GPU_WINDOW_STEP * (self.runtime.current_step + 1 + self.runtime.gpu_profile_bonus),
175
+ )
176
  log_lines = scenario.training_log.splitlines()
177
+ log_window = LOG_WINDOW_STEP * (self.runtime.current_step + 1 + self.runtime.log_tail_bonus)
178
+ visible_log = "\n".join(log_lines[-min(len(log_lines), log_window):])
179
+ diagnostic_report = scenario.diagnostic_report if self.runtime.diagnostic_revealed else None
180
 
181
  return PyTorchDebugObservation(
182
  scenario_id=scenario.scenario_id,
 
186
  loss_curve_window=scenario.loss_curve[:loss_window_size],
187
  gpu_profile_window=scenario.gpu_profile[:gpu_window_size],
188
  training_log_tail=visible_log,
189
+ diagnostic_report=diagnostic_report,
190
  step_num=self.runtime.current_step,
191
  steps_remaining=max(0, self.runtime.max_steps - self.runtime.current_step),
192
  investigation_budget=max(0, self.runtime.max_steps - self.runtime.current_step),
src/pytorch_debug_env/graders.py CHANGED
@@ -1,21 +1,24 @@
1
  # src/pytorch_debug_env/graders.py
2
  from __future__ import annotations
3
 
4
- from .reward import final_diagnosis_score
5
 
6
 
7
  def grade_easy(action: dict, gt: dict) -> float:
8
- return final_diagnosis_score(action, gt)
 
9
 
10
 
11
  def grade_medium(action: dict, gt: dict) -> float:
 
12
  score = final_diagnosis_score(action, gt)
13
  if action.get("affected_file") in gt.get("related_files", []):
14
  score = min(1.0, score + 0.05)
15
- return round(score, 4)
16
 
17
 
18
  def grade_hard(action: dict, gt: dict) -> float:
 
19
  score = final_diagnosis_score(action, gt)
20
 
21
  # partial credit if model gets the right category on subtle bugs
@@ -28,4 +31,4 @@ def grade_hard(action: dict, gt: dict) -> float:
28
  if action.get("affected_file") == gt.get("red_herring_file"):
29
  score = max(0.0, score - 0.1)
30
 
31
- return round(min(score, 1.0), 4)
 
1
  # src/pytorch_debug_env/graders.py
2
  from __future__ import annotations
3
 
4
+ from .reward import clamp_score, final_diagnosis_score
5
 
6
 
7
  def grade_easy(action: dict, gt: dict) -> float:
8
+ """Easy grader: strict match on the core diagnosis fields."""
9
+ return clamp_score(final_diagnosis_score(action, gt))
10
 
11
 
12
  def grade_medium(action: dict, gt: dict) -> float:
13
+ """Medium grader: add small credit for related-file hypotheses."""
14
  score = final_diagnosis_score(action, gt)
15
  if action.get("affected_file") in gt.get("related_files", []):
16
  score = min(1.0, score + 0.05)
17
+ return round(clamp_score(score), 4)
18
 
19
 
20
  def grade_hard(action: dict, gt: dict) -> float:
21
+ """Hard grader: allow category credit, penalize red herrings."""
22
  score = final_diagnosis_score(action, gt)
23
 
24
  # partial credit if model gets the right category on subtle bugs
 
31
  if action.get("affected_file") == gt.get("red_herring_file"):
32
  score = max(0.0, score - 0.1)
33
 
34
+ return round(clamp_score(min(score, 1.0)), 4)
src/pytorch_debug_env/models.py CHANGED
@@ -51,6 +51,7 @@ class PyTorchDebugObservation(BaseModel):
51
  loss_curve_window: List[Dict]
52
  gpu_profile_window: List[Dict]
53
  training_log_tail: str
 
54
  step_num: int
55
  steps_remaining: int
56
  investigation_budget: int
@@ -65,6 +66,7 @@ class PyTorchDebugState(BaseModel):
65
  current_step: int
66
  revealed_files: List[str]
67
  remaining_files: List[str]
 
68
  done: bool
69
  final_score: float = 0.0
70
 
 
51
  loss_curve_window: List[Dict]
52
  gpu_profile_window: List[Dict]
53
  training_log_tail: str
54
+ diagnostic_report: Optional[str] = None
55
  step_num: int
56
  steps_remaining: int
57
  investigation_budget: int
 
66
  current_step: int
67
  revealed_files: List[str]
68
  remaining_files: List[str]
69
+ diagnostic_revealed: bool = False
70
  done: bool
71
  final_score: float = 0.0
72
 
src/pytorch_debug_env/reward.py CHANGED
@@ -3,26 +3,35 @@ from __future__ import annotations
3
 
4
  from .bug_library import BUG_CATEGORIES
5
 
 
 
 
 
 
 
 
6
 
7
  def hypothesis_quality(hypothesis: dict, ground_truth: dict) -> float:
8
- q = 0.0
 
9
 
10
  if hypothesis.get("affected_file") == ground_truth["primary_bug_file"]:
11
- q += 0.45
12
  elif hypothesis.get("affected_file") in ground_truth.get("related_files", []):
13
- q += 0.15
14
 
15
  if hypothesis.get("bug_type") == ground_truth["bug_type"]:
16
- q += 0.40
17
  elif BUG_CATEGORIES.get(hypothesis.get("bug_type")) == BUG_CATEGORIES.get(ground_truth["bug_type"]):
18
- q += 0.13
19
 
20
- calibration = 1.0 - abs(hypothesis.get("confidence", 0.5) - min(q, 1.0))
21
- q += 0.15 * calibration
22
- return round(min(q, 1.0), 4)
23
 
24
 
25
  def final_diagnosis_score(diagnosis: dict, ground_truth: dict) -> float:
 
26
  score = 0.0
27
 
28
  if diagnosis.get("bug_type") == ground_truth["bug_type"]:
@@ -38,10 +47,11 @@ def final_diagnosis_score(diagnosis: dict, ground_truth: dict) -> float:
38
  if diagnosis.get("fix_strategy") == ground_truth["fix_strategy"]:
39
  score += 0.15
40
 
41
- return round(min(score, 1.0), 4)
42
 
43
 
44
  def line_overlap(pred: list[int], actual: list[int]) -> float:
 
45
  p1, p2 = pred
46
  a1, a2 = actual
47
  inter = max(0, min(p2, a2) - max(p1, a1) + 1)
@@ -58,6 +68,7 @@ def compute_step_reward(
58
  step_num: int = 1,
59
  max_steps: int = 5,
60
  ) -> tuple[float, dict]:
 
61
  current_quality = hypothesis_quality(current_hypothesis, ground_truth)
62
  delta = current_quality - previous_quality
63
 
@@ -81,7 +92,7 @@ def compute_step_reward(
81
  diagnosis_reward += max(0.0, 0.08 * (max_steps - step_num))
82
 
83
  total = 0.60 * delta + 0.20 * investigation_reward + 0.20 * diagnosis_reward + confirmation_bonus
84
- total = round(min(max(total, 0.0), 1.0), 4)
85
 
86
  return total, {
87
  "hypothesis_quality": current_quality,
 
3
 
4
  from .bug_library import BUG_CATEGORIES
5
 
6
+ EPSILON = 1e-3
7
+
8
+
9
+ def clamp_score(value: float) -> float:
10
+ """Clamp scores to the open interval (0, 1) for validator compliance."""
11
+ return min(max(value, EPSILON), 1.0 - EPSILON)
12
+
13
 
14
  def hypothesis_quality(hypothesis: dict, ground_truth: dict) -> float:
15
+ """Score how well the current hypothesis matches the ground truth."""
16
+ quality = 0.0
17
 
18
  if hypothesis.get("affected_file") == ground_truth["primary_bug_file"]:
19
+ quality += 0.45
20
  elif hypothesis.get("affected_file") in ground_truth.get("related_files", []):
21
+ quality += 0.15
22
 
23
  if hypothesis.get("bug_type") == ground_truth["bug_type"]:
24
+ quality += 0.40
25
  elif BUG_CATEGORIES.get(hypothesis.get("bug_type")) == BUG_CATEGORIES.get(ground_truth["bug_type"]):
26
+ quality += 0.13
27
 
28
+ calibration = 1.0 - abs(hypothesis.get("confidence", 0.5) - min(quality, 1.0))
29
+ quality += 0.15 * calibration
30
+ return round(min(quality, 1.0), 4)
31
 
32
 
33
  def final_diagnosis_score(diagnosis: dict, ground_truth: dict) -> float:
34
+ """Score the committed diagnosis against the ground truth."""
35
  score = 0.0
36
 
37
  if diagnosis.get("bug_type") == ground_truth["bug_type"]:
 
47
  if diagnosis.get("fix_strategy") == ground_truth["fix_strategy"]:
48
  score += 0.15
49
 
50
+ return round(clamp_score(min(score, 1.0)), 4)
51
 
52
 
53
  def line_overlap(pred: list[int], actual: list[int]) -> float:
54
+ """Compute overlap ratio between two line ranges."""
55
  p1, p2 = pred
56
  a1, a2 = actual
57
  inter = max(0, min(p2, a2) - max(p1, a1) + 1)
 
68
  step_num: int = 1,
69
  max_steps: int = 5,
70
  ) -> tuple[float, dict]:
71
+ """Compute step-level reward and diagnostic components."""
72
  current_quality = hypothesis_quality(current_hypothesis, ground_truth)
73
  delta = current_quality - previous_quality
74
 
 
92
  diagnosis_reward += max(0.0, 0.08 * (max_steps - step_num))
93
 
94
  total = 0.60 * delta + 0.20 * investigation_reward + 0.20 * diagnosis_reward + confirmation_bonus
95
+ total = round(clamp_score(min(max(total, 0.0), 1.0)), 4)
96
 
97
  return total, {
98
  "hypothesis_quality": current_quality,
src/pytorch_debug_env/scenario_generator.py CHANGED
@@ -19,14 +19,17 @@ class Scenario:
19
  loss_curve: List[Dict]
20
  gpu_profile: List[Dict]
21
  training_log: str
 
22
  ground_truth: Dict
23
 
24
 
25
  class ScenarioGenerator:
26
  def __init__(self, bug_templates: List[BugTemplate]):
 
27
  self.bug_templates = bug_templates
28
 
29
  def generate(self, difficulty: str, seed: int | None = None) -> Scenario:
 
30
  rng = random.Random(seed)
31
  candidates = [b for b in self.bug_templates if b.difficulty == difficulty]
32
  if not candidates:
@@ -39,6 +42,7 @@ class ScenarioGenerator:
39
  loss_curve = template.artifact_generator("loss_curve", rng)
40
  gpu_profile = template.artifact_generator("gpu_profile", rng)
41
  training_log = template.artifact_generator("training_log", rng)
 
42
 
43
  ground_truth = {
44
  "bug_type": template.bug_type,
@@ -57,6 +61,7 @@ class ScenarioGenerator:
57
  loss_curve=loss_curve,
58
  gpu_profile=gpu_profile,
59
  training_log=training_log,
 
60
  ground_truth=ground_truth,
61
  )
62
 
 
19
  loss_curve: List[Dict]
20
  gpu_profile: List[Dict]
21
  training_log: str
22
+ diagnostic_report: str
23
  ground_truth: Dict
24
 
25
 
26
  class ScenarioGenerator:
27
  def __init__(self, bug_templates: List[BugTemplate]):
28
+ """Create a generator that samples from a set of bug templates."""
29
  self.bug_templates = bug_templates
30
 
31
  def generate(self, difficulty: str, seed: int | None = None) -> Scenario:
32
+ """Build a scenario with deterministic artifacts when a seed is provided."""
33
  rng = random.Random(seed)
34
  candidates = [b for b in self.bug_templates if b.difficulty == difficulty]
35
  if not candidates:
 
42
  loss_curve = template.artifact_generator("loss_curve", rng)
43
  gpu_profile = template.artifact_generator("gpu_profile", rng)
44
  training_log = template.artifact_generator("training_log", rng)
45
+ diagnostic_report = template.artifact_generator("diagnostic_report", rng)
46
 
47
  ground_truth = {
48
  "bug_type": template.bug_type,
 
61
  loss_curve=loss_curve,
62
  gpu_profile=gpu_profile,
63
  training_log=training_log,
64
+ diagnostic_report=diagnostic_report,
65
  ground_truth=ground_truth,
66
  )
67
 
src/pytorch_debug_env/server.py CHANGED
@@ -27,13 +27,13 @@ async def health():
27
 
28
 
29
  @app.post("/reset")
30
- async def reset(task_id: str = "easy"):
31
  global latest_session_id
32
  session_id = str(uuid4())
33
  env = PyTorchDebugEnv(generator=ScenarioGenerator(BUG_TEMPLATES))
34
  sessions[session_id] = env
35
  latest_session_id = session_id
36
- obs = await env.reset(task_id=task_id)
37
  return {"session_id": session_id, "observation": obs, "done": False}
38
 
39
 
 
27
 
28
 
29
  @app.post("/reset")
30
+ async def reset(task_id: str = "easy", seed: int | None = None):
31
  global latest_session_id
32
  session_id = str(uuid4())
33
  env = PyTorchDebugEnv(generator=ScenarioGenerator(BUG_TEMPLATES))
34
  sessions[session_id] = env
35
  latest_session_id = session_id
36
+ obs = await env.reset(task_id=task_id, seed=seed)
37
  return {"session_id": session_id, "observation": obs, "done": False}
38
 
39
 
tests/test_environment_edge_cases.py CHANGED
@@ -87,7 +87,7 @@ async def test_reward_range_and_info_keys():
87
  ),
88
  )
89
  result = await env.step(action)
90
- assert 0.0 <= result["reward"] <= 1.0
91
  for key in (
92
  "hypothesis_quality",
93
  "hypothesis_delta",
@@ -96,3 +96,69 @@ async def test_reward_range_and_info_keys():
96
  "confirmation_bonus",
97
  ):
98
  assert key in result["info"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  ),
88
  )
89
  result = await env.step(action)
90
+ assert 0.0 < result["reward"] < 1.0
91
  for key in (
92
  "hypothesis_quality",
93
  "hypothesis_delta",
 
96
  "confirmation_bonus",
97
  ):
98
  assert key in result["info"]
99
+
100
+
101
+ @pytest.mark.asyncio
102
+ async def test_extend_loss_curve_increases_window():
103
+ env = make_env()
104
+ await env.reset("easy", seed=123)
105
+ action = PyTorchDebugAction(
106
+ current_hypothesis=base_hypothesis(),
107
+ investigation_action=InvestigationAction(action="extend_loss_curve"),
108
+ )
109
+ extended = await env.step(action)
110
+ extended_len = len(extended["observation"].loss_curve_window)
111
+
112
+ env_base = make_env()
113
+ await env_base.reset("easy", seed=123)
114
+ base = await env_base.step(PyTorchDebugAction(current_hypothesis=base_hypothesis()))
115
+ base_len = len(base["observation"].loss_curve_window)
116
+ assert extended_len > base_len
117
+
118
+
119
+ @pytest.mark.asyncio
120
+ async def test_extend_gpu_profile_increases_window():
121
+ env = make_env()
122
+ await env.reset("easy", seed=321)
123
+ action = PyTorchDebugAction(
124
+ current_hypothesis=base_hypothesis(),
125
+ investigation_action=InvestigationAction(action="extend_gpu_profile"),
126
+ )
127
+ extended = await env.step(action)
128
+ extended_len = len(extended["observation"].gpu_profile_window)
129
+
130
+ env_base = make_env()
131
+ await env_base.reset("easy", seed=321)
132
+ base = await env_base.step(PyTorchDebugAction(current_hypothesis=base_hypothesis()))
133
+ base_len = len(base["observation"].gpu_profile_window)
134
+ assert extended_len > base_len
135
+
136
+
137
+ @pytest.mark.asyncio
138
+ async def test_reveal_log_chunk_extends_tail():
139
+ env = make_env()
140
+ await env.reset("easy", seed=77)
141
+ action = PyTorchDebugAction(
142
+ current_hypothesis=base_hypothesis(),
143
+ investigation_action=InvestigationAction(action="reveal_log_chunk"),
144
+ )
145
+ extended = await env.step(action)
146
+ extended_len = len(extended["observation"].training_log_tail)
147
+
148
+ env_base = make_env()
149
+ await env_base.reset("easy", seed=77)
150
+ base = await env_base.step(PyTorchDebugAction(current_hypothesis=base_hypothesis()))
151
+ base_len = len(base["observation"].training_log_tail)
152
+ assert extended_len >= base_len
153
+
154
+
155
+ @pytest.mark.asyncio
156
+ async def test_run_diagnostic_exposes_report():
157
+ env = make_env()
158
+ await env.reset("easy", seed=11)
159
+ action = PyTorchDebugAction(
160
+ current_hypothesis=base_hypothesis(),
161
+ investigation_action=InvestigationAction(action="run_diagnostic"),
162
+ )
163
+ result = await env.step(action)
164
+ assert result["observation"].diagnostic_report
tests/test_graders.py CHANGED
@@ -16,7 +16,9 @@ def test_grade_easy():
16
  "fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
17
  "confidence": 0.8
18
  }
19
- assert grade_easy(action, gt) > 0.8
 
 
20
 
21
 
22
  def test_grade_medium_related_file_bonus():
@@ -34,7 +36,9 @@ def test_grade_medium_related_file_bonus():
34
  "fix_strategy": "Ensure validation split is strictly separate from training",
35
  "confidence": 0.6,
36
  }
37
- assert grade_medium(action, gt) >= grade_easy(action, gt)
 
 
38
 
39
 
40
  def test_grade_hard_category_partial_credit():
@@ -54,7 +58,9 @@ def test_grade_hard_category_partial_credit():
54
  "fix_strategy": "Use CrossEntropyLoss instead of MSE",
55
  "confidence": 0.5,
56
  }
57
- assert grade_hard(action, gt) >= 0.18
 
 
58
 
59
 
60
  def test_grade_hard_penalizes_red_herring():
@@ -76,3 +82,4 @@ def test_grade_hard_penalizes_red_herring():
76
  }
77
  penalized = grade_hard(action, gt)
78
  assert penalized <= 0.9
 
 
16
  "fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
17
  "confidence": 0.8
18
  }
19
+ score = grade_easy(action, gt)
20
+ assert score > 0.8
21
+ assert score < 1.0
22
 
23
 
24
  def test_grade_medium_related_file_bonus():
 
36
  "fix_strategy": "Ensure validation split is strictly separate from training",
37
  "confidence": 0.6,
38
  }
39
+ score = grade_medium(action, gt)
40
+ assert score >= grade_easy(action, gt)
41
+ assert 0.0 < score < 1.0
42
 
43
 
44
  def test_grade_hard_category_partial_credit():
 
58
  "fix_strategy": "Use CrossEntropyLoss instead of MSE",
59
  "confidence": 0.5,
60
  }
61
+ score = grade_hard(action, gt)
62
+ assert score >= 0.18
63
+ assert 0.0 < score < 1.0
64
 
65
 
66
  def test_grade_hard_penalizes_red_herring():
 
82
  }
83
  penalized = grade_hard(action, gt)
84
  assert penalized <= 0.9
85
+ assert 0.0 < penalized < 1.0
tests/test_reward.py CHANGED
@@ -1,5 +1,6 @@
1
  # tests/test_reward.py
2
  from src.pytorch_debug_env.reward import (
 
3
  compute_step_reward,
4
  final_diagnosis_score,
5
  hypothesis_quality,
@@ -39,7 +40,8 @@ def test_final_diagnosis_score_bounds():
39
  "line_range": [10, 12],
40
  "fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
41
  }
42
- assert 0.0 <= final_diagnosis_score(action, gt) <= 1.0
 
43
 
44
 
45
  def test_compute_step_reward_clamps_non_negative():
@@ -65,5 +67,10 @@ def test_compute_step_reward_clamps_non_negative():
65
  step_num=1,
66
  max_steps=5,
67
  )
68
- assert reward >= 0.0
69
  assert components["investigation_reward"] <= 0.0
 
 
 
 
 
 
1
  # tests/test_reward.py
2
  from src.pytorch_debug_env.reward import (
3
+ clamp_score,
4
  compute_step_reward,
5
  final_diagnosis_score,
6
  hypothesis_quality,
 
40
  "line_range": [10, 12],
41
  "fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
42
  }
43
+ score = final_diagnosis_score(action, gt)
44
+ assert 0.0 < score < 1.0
45
 
46
 
47
  def test_compute_step_reward_clamps_non_negative():
 
67
  step_num=1,
68
  max_steps=5,
69
  )
70
+ assert 0.0 < reward < 1.0
71
  assert components["investigation_reward"] <= 0.0
72
+
73
+
74
+ def test_clamp_score_open_interval():
75
+ assert 0.0 < clamp_score(0.0) < 1.0
76
+ assert 0.0 < clamp_score(1.0) < 1.0
tests/test_scenario_generator.py CHANGED
@@ -8,3 +8,12 @@ def test_generate_invalid_difficulty_raises():
8
  generator = ScenarioGenerator(BUG_TEMPLATES)
9
  with pytest.raises(ValueError):
10
  generator.generate("unknown")
 
 
 
 
 
 
 
 
 
 
8
  generator = ScenarioGenerator(BUG_TEMPLATES)
9
  with pytest.raises(ValueError):
10
  generator.generate("unknown")
11
+
12
+
13
+ def test_generate_seed_reproducibility():
14
+ generator = ScenarioGenerator(BUG_TEMPLATES)
15
+ first = generator.generate("easy", seed=123)
16
+ second = generator.generate("easy", seed=123)
17
+ assert first.ground_truth == second.ground_truth
18
+ assert first.repo_files == second.repo_files
19
+ assert first.training_log == second.training_log