Priyansh Saxena committed on
Commit
1435892
·
1 Parent(s): 5b04645

feat: expand scenarios and investigation actions

Browse files
README.md CHANGED
@@ -56,9 +56,11 @@ with env.sync() as client:
56
  | Task | Difficulty | Description |
57
  |------|-----------|-------------|
58
  | `easy` | ⭐ | Single-file bug — missing `zero_grad`, wrong loss |
59
- | `medium` | ⭐⭐ | Multi-file root cause — data leakage, scheduler mismatch |
60
  | `hard` | ⭐⭐⭐ | Silent failure — memory leak, AMP overflow, red herrings |
61
 
 
 
62
  ## Reward Structure
63
 
64
  - **Hypothesis delta** (60%) — reward for improving your bug hypothesis each step
@@ -67,6 +69,33 @@ with env.sync() as client:
67
 
68
  Scores range from `0.0` to `1.0`. Partial credit for correct bug category on hard tasks.
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  ## Environment State
71
 
72
  Each episode provides a synthetic PyTorch repo with:
 
56
  | Task | Difficulty | Description |
57
  |------|-----------|-------------|
58
  | `easy` | ⭐ | Single-file bug — missing `zero_grad`, wrong loss |
59
+ | `medium` | ⭐⭐ | Multi-file root cause — data leakage, learning-rate misconfig |
60
  | `hard` | ⭐⭐⭐ | Silent failure — memory leak, AMP overflow, red herrings |
61
 
62
+ Each difficulty draws from multiple bug templates, so repeated runs do not recycle the same exact failure.
63
+
64
  ## Reward Structure
65
 
66
  - **Hypothesis delta** (60%) — reward for improving your bug hypothesis each step
 
69
 
70
  Scores range from `0.0` to `1.0`. Partial credit for correct bug category on hard tasks.
71
 
72
+ ## Investigation Actions
73
+
74
+ - `reveal_file`: reveal a file from the synthetic repo
75
+ - `extend_loss_curve`: reveal more loss-curve points
76
+ - `extend_gpu_profile`: reveal more GPU profile points
77
+ - `reveal_log_chunk`: append additional training log lines
78
+ - `run_diagnostic`: expose a diagnostic summary report
79
+
80
+ ## Reproducibility
81
+
82
+ Use `SEED` to make scenario selection and artifacts deterministic across runs:
83
+
84
+ ```bash
85
+ set SEED=42
86
+ python inference.py
87
+ ```
88
+
89
+ ## Baseline Scores
90
+
91
+ Run `inference.py` with a fixed `SEED` to record your baseline scores. The script prints per-task `[END]` lines with the final rewards.
92
+
93
+ Example template (fill after running):
94
+
95
+ | Model | Seed | Easy | Medium | Hard |
96
+ |------|------|------|--------|------|
97
+ | gpt-3.5-turbo | 42 | 0.xx | 0.xx | 0.xx |
98
+
99
  ## Environment State
100
 
101
  Each episode provides a synthetic PyTorch repo with:
inference.py CHANGED
@@ -15,6 +15,16 @@ TASKS = os.environ.get("TASKS", "easy,medium,hard")
15
  MAX_STEPS = int(os.environ.get("MAX_STEPS", "5"))
16
  SUCCESS_SCORE_THRESHOLD = float(os.environ.get("SUCCESS_SCORE_THRESHOLD", "0.7"))
17
  MAX_TOTAL_REWARD = float(os.environ.get("MAX_TOTAL_REWARD", "1.0"))
 
 
 
 
 
 
 
 
 
 
18
 
19
 
20
  def _sanitize_field(value: object) -> str:
@@ -57,7 +67,7 @@ You are debugging a PyTorch training job. Respond ONLY with valid JSON matching
57
  }}
58
 
59
  Valid action types: reveal_file, extend_loss_curve, extend_gpu_profile, reveal_log_chunk, run_diagnostic
60
- Valid bug types: missing_zero_grad, data_leakage, memory_leak, learning_rate_too_high, gradient_explosion
61
 
62
  Observation:
63
  {json.dumps(observation)[:8000]}
@@ -76,12 +86,16 @@ async def _run_task(task: str, client: OpenAI) -> None:
76
  rewards: List[float] = []
77
  history: List[str] = []
78
  steps_taken = 0
 
79
 
80
  log_start(task=task, env="pytorch-debug-env", model=MODEL_NAME)
81
 
82
  try:
83
  async with httpx.AsyncClient(timeout=60.0) as session:
84
- reset_resp = await session.post(f"{ENV_URL}/reset", params={"task_id": task})
 
 
 
85
  reset_resp.raise_for_status()
86
  result = reset_resp.json()
87
 
 
15
  MAX_STEPS = int(os.environ.get("MAX_STEPS", "5"))
16
  SUCCESS_SCORE_THRESHOLD = float(os.environ.get("SUCCESS_SCORE_THRESHOLD", "0.7"))
17
  MAX_TOTAL_REWARD = float(os.environ.get("MAX_TOTAL_REWARD", "1.0"))
18
+ SEED = os.environ.get("SEED")
19
+
20
+
21
+ def _parse_seed(value: str | None) -> int | None:
22
+ if value is None:
23
+ return None
24
+ try:
25
+ return int(value)
26
+ except ValueError:
27
+ return None
28
 
29
 
30
  def _sanitize_field(value: object) -> str:
 
67
  }}
68
 
69
  Valid action types: reveal_file, extend_loss_curve, extend_gpu_profile, reveal_log_chunk, run_diagnostic
70
+ Valid bug types: missing_zero_grad, data_leakage, memory_leak, learning_rate_too_high, gradient_explosion, wrong_loss_function, amp_overflow
71
 
72
  Observation:
73
  {json.dumps(observation)[:8000]}
 
86
  rewards: List[float] = []
87
  history: List[str] = []
88
  steps_taken = 0
89
+ seed_value = _parse_seed(SEED)
90
 
91
  log_start(task=task, env="pytorch-debug-env", model=MODEL_NAME)
92
 
93
  try:
94
  async with httpx.AsyncClient(timeout=60.0) as session:
95
+ reset_params = {"task_id": task}
96
+ if seed_value is not None:
97
+ reset_params["seed"] = seed_value
98
+ reset_resp = await session.post(f"{ENV_URL}/reset", params=reset_params)
99
  reset_resp.raise_for_status()
100
  result = reset_resp.json()
101
 
src/pytorch_debug_env/bug_library.py CHANGED
@@ -43,15 +43,66 @@ def dummy_artifact_generator(artifact_type: str, rng):
43
  {"step": int(i), "train_loss": float(base[i] + oscillation[i])}
44
  for i in range(100)
45
  ]
46
- elif artifact_type == "gpu_profile":
47
  t = np.arange(100)
48
  allocated = 2048 + 2.4 * t
49
  return [
50
  {"step": int(i), "allocated_mb": float(allocated[i])}
51
  for i in range(100)
52
  ]
53
- elif artifact_type == "training_log":
54
  return "Epoch 1, Step 0: loss 2.45\nEpoch 1, Step 1: loss 2.43\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  return []
56
 
57
  def mutate_missing_zero_grad(repo_files, rng):
@@ -97,6 +148,52 @@ class ImageDataset(Dataset):
97
  """
98
  return repo_files
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  BUG_TEMPLATES = [
101
  BugTemplate(
102
  bug_type="missing_zero_grad",
@@ -111,6 +208,19 @@ BUG_TEMPLATES = [
111
  artifact_generator=dummy_artifact_generator,
112
  repo_mutator=mutate_missing_zero_grad,
113
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  BugTemplate(
115
  bug_type="data_leakage",
116
  category="data",
@@ -124,6 +234,19 @@ BUG_TEMPLATES = [
124
  artifact_generator=dummy_artifact_generator,
125
  repo_mutator=mutate_data_leakage,
126
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  BugTemplate(
128
  bug_type="memory_leak",
129
  category="resource",
@@ -136,5 +259,18 @@ BUG_TEMPLATES = [
136
  description="Memory leak",
137
  artifact_generator=dummy_artifact_generator,
138
  repo_mutator=mutate_memory_leak,
139
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  ]
 
43
  {"step": int(i), "train_loss": float(base[i] + oscillation[i])}
44
  for i in range(100)
45
  ]
46
+ if artifact_type == "gpu_profile":
47
  t = np.arange(100)
48
  allocated = 2048 + 2.4 * t
49
  return [
50
  {"step": int(i), "allocated_mb": float(allocated[i])}
51
  for i in range(100)
52
  ]
53
+ if artifact_type == "training_log":
54
  return "Epoch 1, Step 0: loss 2.45\nEpoch 1, Step 1: loss 2.43\n"
55
+ if artifact_type == "diagnostic_report":
56
+ return "No critical diagnostics found. Review optimizer and data pipeline."
57
+ return []
58
+
59
+
60
+ def artifact_generator_wrong_loss(artifact_type: str, rng):
61
+ if artifact_type == "loss_curve":
62
+ t = np.arange(100)
63
+ base = 1.8 + 0.05 * np.sin(0.15 * t)
64
+ return [{"step": int(i), "train_loss": float(base[i])} for i in range(100)]
65
+ if artifact_type == "gpu_profile":
66
+ t = np.arange(100)
67
+ allocated = 1900 + 1.8 * t
68
+ return [{"step": int(i), "allocated_mb": float(allocated[i])} for i in range(100)]
69
+ if artifact_type == "training_log":
70
+ return "Epoch 1: loss 1.82, acc 0.11\nEpoch 2: loss 1.80, acc 0.12\n"
71
+ if artifact_type == "diagnostic_report":
72
+ return "Loss plateaus early while accuracy stays near chance. Check loss function."
73
+ return []
74
+
75
+
76
+ def artifact_generator_lr_high(artifact_type: str, rng):
77
+ if artifact_type == "loss_curve":
78
+ t = np.arange(100)
79
+ base = 0.9 + 0.02 * (t ** 1.1)
80
+ return [{"step": int(i), "train_loss": float(base[i])} for i in range(100)]
81
+ if artifact_type == "gpu_profile":
82
+ t = np.arange(100)
83
+ allocated = 2100 + 2.0 * t
84
+ return [{"step": int(i), "allocated_mb": float(allocated[i])} for i in range(100)]
85
+ if artifact_type == "training_log":
86
+ return "Step 10: loss 3.20 (spike)\nStep 20: loss 5.10 (diverged)\n"
87
+ if artifact_type == "diagnostic_report":
88
+ return "Loss spikes suggest unstable updates. Consider lowering learning rate."
89
+ return []
90
+
91
+
92
+ def artifact_generator_amp_overflow(artifact_type: str, rng):
93
+ if artifact_type == "loss_curve":
94
+ t = np.arange(100)
95
+ base = 2.1 * np.exp(-0.008 * t) + 0.2
96
+ base[30:] = base[30:] + 0.6
97
+ return [{"step": int(i), "train_loss": float(base[i])} for i in range(100)]
98
+ if artifact_type == "gpu_profile":
99
+ t = np.arange(100)
100
+ allocated = 2300 + 3.2 * t
101
+ return [{"step": int(i), "allocated_mb": float(allocated[i])} for i in range(100)]
102
+ if artifact_type == "training_log":
103
+ return "AMP: overflow detected, skipping step\nAMP: scale reduced to 32768\n"
104
+ if artifact_type == "diagnostic_report":
105
+ return "AMP overflow warnings observed. Ensure GradScaler is used correctly."
106
  return []
107
 
108
  def mutate_missing_zero_grad(repo_files, rng):
 
148
  """
149
  return repo_files
150
 
151
+
152
+ def mutate_wrong_loss_function(repo_files, rng):
153
+ repo_files["train.py"] = """import torch
154
+ from model.architecture import Net
155
+
156
+ model = Net()
157
+ optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
158
+ criterion = torch.nn.MSELoss() # BUG: wrong loss for classification
159
+
160
+ for epoch in range(10):
161
+ for x, y in dataloader:
162
+ optimizer.zero_grad()
163
+ output = model(x)
164
+ loss = criterion(output, y)
165
+ loss.backward()
166
+ optimizer.step()
167
+ """
168
+ return repo_files
169
+
170
+
171
+ def mutate_learning_rate_too_high(repo_files, rng):
172
+ repo_files["config/training_config.yaml"] = """lr: 1.0
173
+ batch_size: 32
174
+ """
175
+ return repo_files
176
+
177
+
178
+ def mutate_amp_overflow(repo_files, rng):
179
+ repo_files["train.py"] = """import torch
180
+ from model.architecture import Net
181
+
182
+ model = Net().cuda()
183
+ optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
184
+
185
+ for epoch in range(10):
186
+ for x, y in dataloader:
187
+ optimizer.zero_grad()
188
+ with torch.cuda.amp.autocast():
189
+ output = model(x.cuda())
190
+ loss = torch.nn.functional.cross_entropy(output, y.cuda())
191
+ # BUG: missing GradScaler handling can cause overflows
192
+ loss.backward()
193
+ optimizer.step()
194
+ """
195
+ return repo_files
196
+
197
  BUG_TEMPLATES = [
198
  BugTemplate(
199
  bug_type="missing_zero_grad",
 
208
  artifact_generator=dummy_artifact_generator,
209
  repo_mutator=mutate_missing_zero_grad,
210
  ),
211
+ BugTemplate(
212
+ bug_type="wrong_loss_function",
213
+ category="optimization",
214
+ difficulty="easy",
215
+ primary_bug_file="train.py",
216
+ related_files=["config/training_config.yaml"],
217
+ red_herring_file="data/dataset.py",
218
+ fix_strategy="Use CrossEntropyLoss for classification logits",
219
+ line_range=[6, 12],
220
+ description="Wrong loss function",
221
+ artifact_generator=artifact_generator_wrong_loss,
222
+ repo_mutator=mutate_wrong_loss_function,
223
+ ),
224
  BugTemplate(
225
  bug_type="data_leakage",
226
  category="data",
 
234
  artifact_generator=dummy_artifact_generator,
235
  repo_mutator=mutate_data_leakage,
236
  ),
237
+ BugTemplate(
238
+ bug_type="learning_rate_too_high",
239
+ category="optimization",
240
+ difficulty="medium",
241
+ primary_bug_file="config/training_config.yaml",
242
+ related_files=["train.py"],
243
+ red_herring_file="model/attention.py",
244
+ fix_strategy="Reduce learning rate or use a scheduler",
245
+ line_range=[1, 1],
246
+ description="Learning rate too high",
247
+ artifact_generator=artifact_generator_lr_high,
248
+ repo_mutator=mutate_learning_rate_too_high,
249
+ ),
250
  BugTemplate(
251
  bug_type="memory_leak",
252
  category="resource",
 
259
  description="Memory leak",
260
  artifact_generator=dummy_artifact_generator,
261
  repo_mutator=mutate_memory_leak,
262
+ ),
263
+ BugTemplate(
264
+ bug_type="amp_overflow",
265
+ category="numerics",
266
+ difficulty="hard",
267
+ primary_bug_file="train.py",
268
+ related_files=["config/training_config.yaml"],
269
+ red_herring_file="model/architecture.py",
270
+ fix_strategy="Use GradScaler and scale updates for AMP",
271
+ line_range=[7, 13],
272
+ description="AMP overflow",
273
+ artifact_generator=artifact_generator_amp_overflow,
274
+ repo_mutator=mutate_amp_overflow,
275
+ ),
276
  ]
src/pytorch_debug_env/environment.py CHANGED
@@ -10,11 +10,14 @@ from .models import (
10
  PyTorchDebugObservation,
11
  PyTorchDebugState,
12
  )
13
- from .reward import compute_step_reward
14
  from .scenario_generator import ScenarioGenerator
15
  from .graders import grade_easy, grade_medium, grade_hard
16
 
17
  GRADER_MAP = {"easy": grade_easy, "medium": grade_medium, "hard": grade_hard}
 
 
 
18
 
19
 
20
  @dataclass
@@ -24,29 +27,40 @@ class RuntimeState:
24
  current_step: int = 0
25
  revealed_files: List[str] = field(default_factory=list)
26
  hypothesis_history: List[HypothesisRecord] = field(default_factory=list)
 
 
 
 
27
  done: bool = False
28
  final_score: float = 0.0
29
 
30
 
31
  class PyTorchDebugEnv:
32
  def __init__(self, generator: ScenarioGenerator, max_steps: int = 5):
 
33
  self.generator = generator
34
  self.runtime = RuntimeState(max_steps=max_steps)
35
 
36
- async def reset(self, task_id: str = "easy"):
37
- scenario = self.generator.generate(task_id)
 
38
  self.runtime = RuntimeState(
39
  scenario=scenario,
40
  max_steps=5 if task_id == "easy" else 6,
41
  current_step=0,
42
  revealed_files=["train.py", "config/training_config.yaml"],
43
  hypothesis_history=[],
 
 
 
 
44
  done=False,
45
  final_score=0.0,
46
  )
47
  return self._build_observation(last_feedback="Episode reset.")
48
 
49
  async def step(self, action: PyTorchDebugAction):
 
50
  if self.runtime.scenario is None:
51
  raise RuntimeError("Call /reset before /step")
52
 
@@ -58,10 +72,23 @@ class PyTorchDebugEnv:
58
  previous_quality = self.runtime.hypothesis_history[-1].quality if self.runtime.hypothesis_history else 0.0
59
 
60
  investigation_target = None
61
- if action.investigation_action and action.investigation_action.action == "reveal_file":
62
- investigation_target = action.investigation_action.target
63
- if investigation_target in scenario.repo_files and investigation_target not in self.runtime.revealed_files:
64
- self.runtime.revealed_files.append(investigation_target)
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  committed = action.final_diagnosis.model_dump() if action.commit_diagnosis and action.final_diagnosis else None
67
  reward, components = compute_step_reward(
@@ -73,6 +100,7 @@ class PyTorchDebugEnv:
73
  step_num=self.runtime.current_step,
74
  max_steps=self.runtime.max_steps,
75
  )
 
76
 
77
  if committed:
78
  grader = GRADER_MAP.get(scenario.task_id, grade_easy)
@@ -89,7 +117,7 @@ class PyTorchDebugEnv:
89
  conf_bonus = components["confirmation_bonus"]
90
 
91
  total = 0.60 * delta + 0.20 * inv_reward + 0.20 * diagnosis_reward + conf_bonus
92
- reward = round(min(max(total, 0.0), 1.0), 4)
93
 
94
  self.runtime.hypothesis_history.append(
95
  HypothesisRecord(
@@ -114,6 +142,7 @@ class PyTorchDebugEnv:
114
  }
115
 
116
  async def state(self):
 
117
  scenario = self.runtime.scenario
118
  if not scenario:
119
  return None
@@ -126,6 +155,7 @@ class PyTorchDebugEnv:
126
  remaining_files=[
127
  f for f in scenario.repo_files.keys() if f not in self.runtime.revealed_files
128
  ],
 
129
  done=self.runtime.done,
130
  final_score=self.runtime.final_score,
131
  )
@@ -135,10 +165,18 @@ class PyTorchDebugEnv:
135
  revealed = {k: v for k, v in scenario.repo_files.items() if k in self.runtime.revealed_files}
136
  available = [k for k in scenario.repo_files.keys() if k not in self.runtime.revealed_files]
137
 
138
- loss_window_size = min(len(scenario.loss_curve), 100 * (self.runtime.current_step + 1))
139
- gpu_window_size = min(len(scenario.gpu_profile), 100 * (self.runtime.current_step + 1))
 
 
 
 
 
 
140
  log_lines = scenario.training_log.splitlines()
141
- visible_log = "\n".join(log_lines[-min(len(log_lines), 10 * (self.runtime.current_step + 1)):])
 
 
142
 
143
  return PyTorchDebugObservation(
144
  scenario_id=scenario.scenario_id,
@@ -148,6 +186,7 @@ class PyTorchDebugEnv:
148
  loss_curve_window=scenario.loss_curve[:loss_window_size],
149
  gpu_profile_window=scenario.gpu_profile[:gpu_window_size],
150
  training_log_tail=visible_log,
 
151
  step_num=self.runtime.current_step,
152
  steps_remaining=max(0, self.runtime.max_steps - self.runtime.current_step),
153
  investigation_budget=max(0, self.runtime.max_steps - self.runtime.current_step),
 
10
  PyTorchDebugObservation,
11
  PyTorchDebugState,
12
  )
13
+ from .reward import clamp_score, compute_step_reward
14
  from .scenario_generator import ScenarioGenerator
15
  from .graders import grade_easy, grade_medium, grade_hard
16
 
17
  GRADER_MAP = {"easy": grade_easy, "medium": grade_medium, "hard": grade_hard}
18
+ LOSS_WINDOW_STEP = 25
19
+ GPU_WINDOW_STEP = 25
20
+ LOG_WINDOW_STEP = 10
21
 
22
 
23
  @dataclass
 
27
  current_step: int = 0
28
  revealed_files: List[str] = field(default_factory=list)
29
  hypothesis_history: List[HypothesisRecord] = field(default_factory=list)
30
+ loss_curve_bonus: int = 0
31
+ gpu_profile_bonus: int = 0
32
+ log_tail_bonus: int = 0
33
+ diagnostic_revealed: bool = False
34
  done: bool = False
35
  final_score: float = 0.0
36
 
37
 
38
  class PyTorchDebugEnv:
39
  def __init__(self, generator: ScenarioGenerator, max_steps: int = 5):
40
+ """Create a PyTorch debugging environment with a scenario generator."""
41
  self.generator = generator
42
  self.runtime = RuntimeState(max_steps=max_steps)
43
 
44
+ async def reset(self, task_id: str = "easy", seed: int | None = None):
45
+ """Start a new episode and return the initial observation."""
46
+ scenario = self.generator.generate(task_id, seed=seed)
47
  self.runtime = RuntimeState(
48
  scenario=scenario,
49
  max_steps=5 if task_id == "easy" else 6,
50
  current_step=0,
51
  revealed_files=["train.py", "config/training_config.yaml"],
52
  hypothesis_history=[],
53
+ loss_curve_bonus=0,
54
+ gpu_profile_bonus=0,
55
+ log_tail_bonus=0,
56
+ diagnostic_revealed=False,
57
  done=False,
58
  final_score=0.0,
59
  )
60
  return self._build_observation(last_feedback="Episode reset.")
61
 
62
  async def step(self, action: PyTorchDebugAction):
63
+ """Advance the environment by one step using the provided action."""
64
  if self.runtime.scenario is None:
65
  raise RuntimeError("Call /reset before /step")
66
 
 
72
  previous_quality = self.runtime.hypothesis_history[-1].quality if self.runtime.hypothesis_history else 0.0
73
 
74
  investigation_target = None
75
+ if action.investigation_action:
76
+ action_type = action.investigation_action.action
77
+ if action_type == "reveal_file":
78
+ investigation_target = action.investigation_action.target
79
+ if (
80
+ investigation_target in scenario.repo_files
81
+ and investigation_target not in self.runtime.revealed_files
82
+ ):
83
+ self.runtime.revealed_files.append(investigation_target)
84
+ elif action_type == "extend_loss_curve":
85
+ self.runtime.loss_curve_bonus += 1
86
+ elif action_type == "extend_gpu_profile":
87
+ self.runtime.gpu_profile_bonus += 1
88
+ elif action_type == "reveal_log_chunk":
89
+ self.runtime.log_tail_bonus += 1
90
+ elif action_type == "run_diagnostic":
91
+ self.runtime.diagnostic_revealed = True
92
 
93
  committed = action.final_diagnosis.model_dump() if action.commit_diagnosis and action.final_diagnosis else None
94
  reward, components = compute_step_reward(
 
100
  step_num=self.runtime.current_step,
101
  max_steps=self.runtime.max_steps,
102
  )
103
+ reward = clamp_score(reward)
104
 
105
  if committed:
106
  grader = GRADER_MAP.get(scenario.task_id, grade_easy)
 
117
  conf_bonus = components["confirmation_bonus"]
118
 
119
  total = 0.60 * delta + 0.20 * inv_reward + 0.20 * diagnosis_reward + conf_bonus
120
+ reward = round(clamp_score(min(max(total, 0.0), 1.0)), 4)
121
 
122
  self.runtime.hypothesis_history.append(
123
  HypothesisRecord(
 
142
  }
143
 
144
  async def state(self):
145
+ """Return the current episode state, or None if not started."""
146
  scenario = self.runtime.scenario
147
  if not scenario:
148
  return None
 
155
  remaining_files=[
156
  f for f in scenario.repo_files.keys() if f not in self.runtime.revealed_files
157
  ],
158
+ diagnostic_revealed=self.runtime.diagnostic_revealed,
159
  done=self.runtime.done,
160
  final_score=self.runtime.final_score,
161
  )
 
165
  revealed = {k: v for k, v in scenario.repo_files.items() if k in self.runtime.revealed_files}
166
  available = [k for k in scenario.repo_files.keys() if k not in self.runtime.revealed_files]
167
 
168
+ loss_window_size = min(
169
+ len(scenario.loss_curve),
170
+ LOSS_WINDOW_STEP * (self.runtime.current_step + 1 + self.runtime.loss_curve_bonus),
171
+ )
172
+ gpu_window_size = min(
173
+ len(scenario.gpu_profile),
174
+ GPU_WINDOW_STEP * (self.runtime.current_step + 1 + self.runtime.gpu_profile_bonus),
175
+ )
176
  log_lines = scenario.training_log.splitlines()
177
+ log_window = LOG_WINDOW_STEP * (self.runtime.current_step + 1 + self.runtime.log_tail_bonus)
178
+ visible_log = "\n".join(log_lines[-min(len(log_lines), log_window):])
179
+ diagnostic_report = scenario.diagnostic_report if self.runtime.diagnostic_revealed else None
180
 
181
  return PyTorchDebugObservation(
182
  scenario_id=scenario.scenario_id,
 
186
  loss_curve_window=scenario.loss_curve[:loss_window_size],
187
  gpu_profile_window=scenario.gpu_profile[:gpu_window_size],
188
  training_log_tail=visible_log,
189
+ diagnostic_report=diagnostic_report,
190
  step_num=self.runtime.current_step,
191
  steps_remaining=max(0, self.runtime.max_steps - self.runtime.current_step),
192
  investigation_budget=max(0, self.runtime.max_steps - self.runtime.current_step),
src/pytorch_debug_env/graders.py CHANGED
@@ -1,21 +1,24 @@
1
  # src/pytorch_debug_env/graders.py
2
  from __future__ import annotations
3
 
4
- from .reward import final_diagnosis_score
5
 
6
 
7
  def grade_easy(action: dict, gt: dict) -> float:
8
- return final_diagnosis_score(action, gt)
 
9
 
10
 
11
  def grade_medium(action: dict, gt: dict) -> float:
 
12
  score = final_diagnosis_score(action, gt)
13
  if action.get("affected_file") in gt.get("related_files", []):
14
  score = min(1.0, score + 0.05)
15
- return round(score, 4)
16
 
17
 
18
  def grade_hard(action: dict, gt: dict) -> float:
 
19
  score = final_diagnosis_score(action, gt)
20
 
21
  # partial credit if model gets the right category on subtle bugs
@@ -28,4 +31,4 @@ def grade_hard(action: dict, gt: dict) -> float:
28
  if action.get("affected_file") == gt.get("red_herring_file"):
29
  score = max(0.0, score - 0.1)
30
 
31
- return round(min(score, 1.0), 4)
 
1
  # src/pytorch_debug_env/graders.py
2
  from __future__ import annotations
3
 
4
+ from .reward import clamp_score, final_diagnosis_score
5
 
6
 
7
  def grade_easy(action: dict, gt: dict) -> float:
8
+ """Easy grader: strict match on the core diagnosis fields."""
9
+ return clamp_score(final_diagnosis_score(action, gt))
10
 
11
 
12
  def grade_medium(action: dict, gt: dict) -> float:
13
+ """Medium grader: add small credit for related-file hypotheses."""
14
  score = final_diagnosis_score(action, gt)
15
  if action.get("affected_file") in gt.get("related_files", []):
16
  score = min(1.0, score + 0.05)
17
+ return round(clamp_score(score), 4)
18
 
19
 
20
  def grade_hard(action: dict, gt: dict) -> float:
21
+ """Hard grader: allow category credit, penalize red herrings."""
22
  score = final_diagnosis_score(action, gt)
23
 
24
  # partial credit if model gets the right category on subtle bugs
 
31
  if action.get("affected_file") == gt.get("red_herring_file"):
32
  score = max(0.0, score - 0.1)
33
 
34
+ return round(clamp_score(min(score, 1.0)), 4)
src/pytorch_debug_env/models.py CHANGED
@@ -51,6 +51,7 @@ class PyTorchDebugObservation(BaseModel):
51
  loss_curve_window: List[Dict]
52
  gpu_profile_window: List[Dict]
53
  training_log_tail: str
 
54
  step_num: int
55
  steps_remaining: int
56
  investigation_budget: int
@@ -65,6 +66,7 @@ class PyTorchDebugState(BaseModel):
65
  current_step: int
66
  revealed_files: List[str]
67
  remaining_files: List[str]
 
68
  done: bool
69
  final_score: float = 0.0
70
 
 
51
  loss_curve_window: List[Dict]
52
  gpu_profile_window: List[Dict]
53
  training_log_tail: str
54
+ diagnostic_report: Optional[str] = None
55
  step_num: int
56
  steps_remaining: int
57
  investigation_budget: int
 
66
  current_step: int
67
  revealed_files: List[str]
68
  remaining_files: List[str]
69
+ diagnostic_revealed: bool = False
70
  done: bool
71
  final_score: float = 0.0
72
 
src/pytorch_debug_env/reward.py CHANGED
@@ -3,26 +3,35 @@ from __future__ import annotations
3
 
4
  from .bug_library import BUG_CATEGORIES
5
 
 
 
 
 
 
 
 
6
 
7
  def hypothesis_quality(hypothesis: dict, ground_truth: dict) -> float:
8
- q = 0.0
 
9
 
10
  if hypothesis.get("affected_file") == ground_truth["primary_bug_file"]:
11
- q += 0.45
12
  elif hypothesis.get("affected_file") in ground_truth.get("related_files", []):
13
- q += 0.15
14
 
15
  if hypothesis.get("bug_type") == ground_truth["bug_type"]:
16
- q += 0.40
17
  elif BUG_CATEGORIES.get(hypothesis.get("bug_type")) == BUG_CATEGORIES.get(ground_truth["bug_type"]):
18
- q += 0.13
19
 
20
- calibration = 1.0 - abs(hypothesis.get("confidence", 0.5) - min(q, 1.0))
21
- q += 0.15 * calibration
22
- return round(min(q, 1.0), 4)
23
 
24
 
25
  def final_diagnosis_score(diagnosis: dict, ground_truth: dict) -> float:
 
26
  score = 0.0
27
 
28
  if diagnosis.get("bug_type") == ground_truth["bug_type"]:
@@ -38,10 +47,11 @@ def final_diagnosis_score(diagnosis: dict, ground_truth: dict) -> float:
38
  if diagnosis.get("fix_strategy") == ground_truth["fix_strategy"]:
39
  score += 0.15
40
 
41
- return round(min(score, 1.0), 4)
42
 
43
 
44
  def line_overlap(pred: list[int], actual: list[int]) -> float:
 
45
  p1, p2 = pred
46
  a1, a2 = actual
47
  inter = max(0, min(p2, a2) - max(p1, a1) + 1)
@@ -58,6 +68,7 @@ def compute_step_reward(
58
  step_num: int = 1,
59
  max_steps: int = 5,
60
  ) -> tuple[float, dict]:
 
61
  current_quality = hypothesis_quality(current_hypothesis, ground_truth)
62
  delta = current_quality - previous_quality
63
 
@@ -81,7 +92,7 @@ def compute_step_reward(
81
  diagnosis_reward += max(0.0, 0.08 * (max_steps - step_num))
82
 
83
  total = 0.60 * delta + 0.20 * investigation_reward + 0.20 * diagnosis_reward + confirmation_bonus
84
- total = round(min(max(total, 0.0), 1.0), 4)
85
 
86
  return total, {
87
  "hypothesis_quality": current_quality,
 
3
 
4
  from .bug_library import BUG_CATEGORIES
5
 
6
+ EPSILON = 1e-3
7
+
8
+
9
+ def clamp_score(value: float) -> float:
10
+ """Clamp scores to the open interval (0, 1) for validator compliance."""
11
+ return min(max(value, EPSILON), 1.0 - EPSILON)
12
+
13
 
14
  def hypothesis_quality(hypothesis: dict, ground_truth: dict) -> float:
15
+ """Score how well the current hypothesis matches the ground truth."""
16
+ quality = 0.0
17
 
18
  if hypothesis.get("affected_file") == ground_truth["primary_bug_file"]:
19
+ quality += 0.45
20
  elif hypothesis.get("affected_file") in ground_truth.get("related_files", []):
21
+ quality += 0.15
22
 
23
  if hypothesis.get("bug_type") == ground_truth["bug_type"]:
24
+ quality += 0.40
25
  elif BUG_CATEGORIES.get(hypothesis.get("bug_type")) == BUG_CATEGORIES.get(ground_truth["bug_type"]):
26
+ quality += 0.13
27
 
28
+ calibration = 1.0 - abs(hypothesis.get("confidence", 0.5) - min(quality, 1.0))
29
+ quality += 0.15 * calibration
30
+ return round(min(quality, 1.0), 4)
31
 
32
 
33
  def final_diagnosis_score(diagnosis: dict, ground_truth: dict) -> float:
34
+ """Score the committed diagnosis against the ground truth."""
35
  score = 0.0
36
 
37
  if diagnosis.get("bug_type") == ground_truth["bug_type"]:
 
47
  if diagnosis.get("fix_strategy") == ground_truth["fix_strategy"]:
48
  score += 0.15
49
 
50
+ return round(clamp_score(min(score, 1.0)), 4)
51
 
52
 
53
  def line_overlap(pred: list[int], actual: list[int]) -> float:
54
+ """Compute overlap ratio between two line ranges."""
55
  p1, p2 = pred
56
  a1, a2 = actual
57
  inter = max(0, min(p2, a2) - max(p1, a1) + 1)
 
68
  step_num: int = 1,
69
  max_steps: int = 5,
70
  ) -> tuple[float, dict]:
71
+ """Compute step-level reward and diagnostic components."""
72
  current_quality = hypothesis_quality(current_hypothesis, ground_truth)
73
  delta = current_quality - previous_quality
74
 
 
92
  diagnosis_reward += max(0.0, 0.08 * (max_steps - step_num))
93
 
94
  total = 0.60 * delta + 0.20 * investigation_reward + 0.20 * diagnosis_reward + confirmation_bonus
95
+ total = round(clamp_score(min(max(total, 0.0), 1.0)), 4)
96
 
97
  return total, {
98
  "hypothesis_quality": current_quality,
src/pytorch_debug_env/scenario_generator.py CHANGED
@@ -19,14 +19,17 @@ class Scenario:
19
  loss_curve: List[Dict]
20
  gpu_profile: List[Dict]
21
  training_log: str
 
22
  ground_truth: Dict
23
 
24
 
25
  class ScenarioGenerator:
26
  def __init__(self, bug_templates: List[BugTemplate]):
 
27
  self.bug_templates = bug_templates
28
 
29
  def generate(self, difficulty: str, seed: int | None = None) -> Scenario:
 
30
  rng = random.Random(seed)
31
  candidates = [b for b in self.bug_templates if b.difficulty == difficulty]
32
  if not candidates:
@@ -39,6 +42,7 @@ class ScenarioGenerator:
39
  loss_curve = template.artifact_generator("loss_curve", rng)
40
  gpu_profile = template.artifact_generator("gpu_profile", rng)
41
  training_log = template.artifact_generator("training_log", rng)
 
42
 
43
  ground_truth = {
44
  "bug_type": template.bug_type,
@@ -57,6 +61,7 @@ class ScenarioGenerator:
57
  loss_curve=loss_curve,
58
  gpu_profile=gpu_profile,
59
  training_log=training_log,
 
60
  ground_truth=ground_truth,
61
  )
62
 
 
19
  loss_curve: List[Dict]
20
  gpu_profile: List[Dict]
21
  training_log: str
22
+ diagnostic_report: str
23
  ground_truth: Dict
24
 
25
 
26
  class ScenarioGenerator:
27
  def __init__(self, bug_templates: List[BugTemplate]):
28
+ """Create a generator that samples from a set of bug templates."""
29
  self.bug_templates = bug_templates
30
 
31
  def generate(self, difficulty: str, seed: int | None = None) -> Scenario:
32
+ """Build a scenario with deterministic artifacts when a seed is provided."""
33
  rng = random.Random(seed)
34
  candidates = [b for b in self.bug_templates if b.difficulty == difficulty]
35
  if not candidates:
 
42
  loss_curve = template.artifact_generator("loss_curve", rng)
43
  gpu_profile = template.artifact_generator("gpu_profile", rng)
44
  training_log = template.artifact_generator("training_log", rng)
45
+ diagnostic_report = template.artifact_generator("diagnostic_report", rng)
46
 
47
  ground_truth = {
48
  "bug_type": template.bug_type,
 
61
  loss_curve=loss_curve,
62
  gpu_profile=gpu_profile,
63
  training_log=training_log,
64
+ diagnostic_report=diagnostic_report,
65
  ground_truth=ground_truth,
66
  )
67
 
src/pytorch_debug_env/server.py CHANGED
@@ -27,13 +27,13 @@ async def health():
27
 
28
 
29
  @app.post("/reset")
30
- async def reset(task_id: str = "easy"):
31
  global latest_session_id
32
  session_id = str(uuid4())
33
  env = PyTorchDebugEnv(generator=ScenarioGenerator(BUG_TEMPLATES))
34
  sessions[session_id] = env
35
  latest_session_id = session_id
36
- obs = await env.reset(task_id=task_id)
37
  return {"session_id": session_id, "observation": obs, "done": False}
38
 
39
 
 
27
 
28
 
29
  @app.post("/reset")
30
+ async def reset(task_id: str = "easy", seed: int | None = None):
31
  global latest_session_id
32
  session_id = str(uuid4())
33
  env = PyTorchDebugEnv(generator=ScenarioGenerator(BUG_TEMPLATES))
34
  sessions[session_id] = env
35
  latest_session_id = session_id
36
+ obs = await env.reset(task_id=task_id, seed=seed)
37
  return {"session_id": session_id, "observation": obs, "done": False}
38
 
39
 
tests/test_environment_edge_cases.py CHANGED
@@ -87,7 +87,7 @@ async def test_reward_range_and_info_keys():
87
  ),
88
  )
89
  result = await env.step(action)
90
- assert 0.0 <= result["reward"] <= 1.0
91
  for key in (
92
  "hypothesis_quality",
93
  "hypothesis_delta",
@@ -96,3 +96,69 @@ async def test_reward_range_and_info_keys():
96
  "confirmation_bonus",
97
  ):
98
  assert key in result["info"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  ),
88
  )
89
  result = await env.step(action)
90
+ assert 0.0 < result["reward"] < 1.0
91
  for key in (
92
  "hypothesis_quality",
93
  "hypothesis_delta",
 
96
  "confirmation_bonus",
97
  ):
98
  assert key in result["info"]
99
+
100
+
101
+ @pytest.mark.asyncio
102
+ async def test_extend_loss_curve_increases_window():
103
+ env = make_env()
104
+ await env.reset("easy", seed=123)
105
+ action = PyTorchDebugAction(
106
+ current_hypothesis=base_hypothesis(),
107
+ investigation_action=InvestigationAction(action="extend_loss_curve"),
108
+ )
109
+ extended = await env.step(action)
110
+ extended_len = len(extended["observation"].loss_curve_window)
111
+
112
+ env_base = make_env()
113
+ await env_base.reset("easy", seed=123)
114
+ base = await env_base.step(PyTorchDebugAction(current_hypothesis=base_hypothesis()))
115
+ base_len = len(base["observation"].loss_curve_window)
116
+ assert extended_len > base_len
117
+
118
+
119
+ @pytest.mark.asyncio
120
+ async def test_extend_gpu_profile_increases_window():
121
+ env = make_env()
122
+ await env.reset("easy", seed=321)
123
+ action = PyTorchDebugAction(
124
+ current_hypothesis=base_hypothesis(),
125
+ investigation_action=InvestigationAction(action="extend_gpu_profile"),
126
+ )
127
+ extended = await env.step(action)
128
+ extended_len = len(extended["observation"].gpu_profile_window)
129
+
130
+ env_base = make_env()
131
+ await env_base.reset("easy", seed=321)
132
+ base = await env_base.step(PyTorchDebugAction(current_hypothesis=base_hypothesis()))
133
+ base_len = len(base["observation"].gpu_profile_window)
134
+ assert extended_len > base_len
135
+
136
+
137
+ @pytest.mark.asyncio
138
+ async def test_reveal_log_chunk_extends_tail():
139
+ env = make_env()
140
+ await env.reset("easy", seed=77)
141
+ action = PyTorchDebugAction(
142
+ current_hypothesis=base_hypothesis(),
143
+ investigation_action=InvestigationAction(action="reveal_log_chunk"),
144
+ )
145
+ extended = await env.step(action)
146
+ extended_len = len(extended["observation"].training_log_tail)
147
+
148
+ env_base = make_env()
149
+ await env_base.reset("easy", seed=77)
150
+ base = await env_base.step(PyTorchDebugAction(current_hypothesis=base_hypothesis()))
151
+ base_len = len(base["observation"].training_log_tail)
152
+ assert extended_len >= base_len
153
+
154
+
155
+ @pytest.mark.asyncio
156
+ async def test_run_diagnostic_exposes_report():
157
+ env = make_env()
158
+ await env.reset("easy", seed=11)
159
+ action = PyTorchDebugAction(
160
+ current_hypothesis=base_hypothesis(),
161
+ investigation_action=InvestigationAction(action="run_diagnostic"),
162
+ )
163
+ result = await env.step(action)
164
+ assert result["observation"].diagnostic_report
tests/test_graders.py CHANGED
@@ -16,7 +16,9 @@ def test_grade_easy():
16
  "fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
17
  "confidence": 0.8
18
  }
19
- assert grade_easy(action, gt) > 0.8
 
 
20
 
21
 
22
  def test_grade_medium_related_file_bonus():
@@ -34,7 +36,9 @@ def test_grade_medium_related_file_bonus():
34
  "fix_strategy": "Ensure validation split is strictly separate from training",
35
  "confidence": 0.6,
36
  }
37
- assert grade_medium(action, gt) >= grade_easy(action, gt)
 
 
38
 
39
 
40
  def test_grade_hard_category_partial_credit():
@@ -54,7 +58,9 @@ def test_grade_hard_category_partial_credit():
54
  "fix_strategy": "Use CrossEntropyLoss instead of MSE",
55
  "confidence": 0.5,
56
  }
57
- assert grade_hard(action, gt) >= 0.18
 
 
58
 
59
 
60
  def test_grade_hard_penalizes_red_herring():
@@ -76,3 +82,4 @@ def test_grade_hard_penalizes_red_herring():
76
  }
77
  penalized = grade_hard(action, gt)
78
  assert penalized <= 0.9
 
 
16
  "fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
17
  "confidence": 0.8
18
  }
19
+ score = grade_easy(action, gt)
20
+ assert score > 0.8
21
+ assert score < 1.0
22
 
23
 
24
  def test_grade_medium_related_file_bonus():
 
36
  "fix_strategy": "Ensure validation split is strictly separate from training",
37
  "confidence": 0.6,
38
  }
39
+ score = grade_medium(action, gt)
40
+ assert score >= grade_easy(action, gt)
41
+ assert 0.0 < score < 1.0
42
 
43
 
44
  def test_grade_hard_category_partial_credit():
 
58
  "fix_strategy": "Use CrossEntropyLoss instead of MSE",
59
  "confidence": 0.5,
60
  }
61
+ score = grade_hard(action, gt)
62
+ assert score >= 0.18
63
+ assert 0.0 < score < 1.0
64
 
65
 
66
  def test_grade_hard_penalizes_red_herring():
 
82
  }
83
  penalized = grade_hard(action, gt)
84
  assert penalized <= 0.9
85
+ assert 0.0 < penalized < 1.0
tests/test_reward.py CHANGED
@@ -1,5 +1,6 @@
1
  # tests/test_reward.py
2
  from src.pytorch_debug_env.reward import (
 
3
  compute_step_reward,
4
  final_diagnosis_score,
5
  hypothesis_quality,
@@ -39,7 +40,8 @@ def test_final_diagnosis_score_bounds():
39
  "line_range": [10, 12],
40
  "fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
41
  }
42
- assert 0.0 <= final_diagnosis_score(action, gt) <= 1.0
 
43
 
44
 
45
  def test_compute_step_reward_clamps_non_negative():
@@ -65,5 +67,10 @@ def test_compute_step_reward_clamps_non_negative():
65
  step_num=1,
66
  max_steps=5,
67
  )
68
- assert reward >= 0.0
69
  assert components["investigation_reward"] <= 0.0
 
 
 
 
 
 
1
  # tests/test_reward.py
2
  from src.pytorch_debug_env.reward import (
3
+ clamp_score,
4
  compute_step_reward,
5
  final_diagnosis_score,
6
  hypothesis_quality,
 
40
  "line_range": [10, 12],
41
  "fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
42
  }
43
+ score = final_diagnosis_score(action, gt)
44
+ assert 0.0 < score < 1.0
45
 
46
 
47
  def test_compute_step_reward_clamps_non_negative():
 
67
  step_num=1,
68
  max_steps=5,
69
  )
70
+ assert 0.0 < reward < 1.0
71
  assert components["investigation_reward"] <= 0.0
72
+
73
+
74
+ def test_clamp_score_open_interval():
75
+ assert 0.0 < clamp_score(0.0) < 1.0
76
+ assert 0.0 < clamp_score(1.0) < 1.0
tests/test_scenario_generator.py CHANGED
@@ -8,3 +8,12 @@ def test_generate_invalid_difficulty_raises():
8
  generator = ScenarioGenerator(BUG_TEMPLATES)
9
  with pytest.raises(ValueError):
10
  generator.generate("unknown")
 
 
 
 
 
 
 
 
 
 
8
  generator = ScenarioGenerator(BUG_TEMPLATES)
9
  with pytest.raises(ValueError):
10
  generator.generate("unknown")
11
+
12
+
13
+ def test_generate_seed_reproducibility():
14
+ generator = ScenarioGenerator(BUG_TEMPLATES)
15
+ first = generator.generate("easy", seed=123)
16
+ second = generator.generate("easy", seed=123)
17
+ assert first.ground_truth == second.ground_truth
18
+ assert first.repo_files == second.repo_files
19
+ assert first.training_log == second.training_log