Priyansh Saxena commited on
Commit ·
1435892
1
Parent(s): 5b04645
feat: expand scenarios and investigation actions
Browse files- README.md +30 -1
- inference.py +16 -2
- src/pytorch_debug_env/bug_library.py +139 -3
- src/pytorch_debug_env/environment.py +50 -11
- src/pytorch_debug_env/graders.py +7 -4
- src/pytorch_debug_env/models.py +2 -0
- src/pytorch_debug_env/reward.py +21 -10
- src/pytorch_debug_env/scenario_generator.py +5 -0
- src/pytorch_debug_env/server.py +2 -2
- tests/test_environment_edge_cases.py +67 -1
- tests/test_graders.py +10 -3
- tests/test_reward.py +9 -2
- tests/test_scenario_generator.py +9 -0
README.md
CHANGED
|
@@ -56,9 +56,11 @@ with env.sync() as client:
|
|
| 56 |
| Task | Difficulty | Description |
|
| 57 |
|------|-----------|-------------|
|
| 58 |
| `easy` | ⭐ | Single-file bug — missing `zero_grad`, wrong loss |
|
| 59 |
-
| `medium` | ⭐⭐ | Multi-file root cause — data leakage,
|
| 60 |
| `hard` | ⭐⭐⭐ | Silent failure — memory leak, AMP overflow, red herrings |
|
| 61 |
|
|
|
|
|
|
|
| 62 |
## Reward Structure
|
| 63 |
|
| 64 |
- **Hypothesis delta** (60%) — reward for improving your bug hypothesis each step
|
|
@@ -67,6 +69,33 @@ with env.sync() as client:
|
|
| 67 |
|
| 68 |
Scores range from `0.0` to `1.0`. Partial credit for correct bug category on hard tasks.
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
## Environment State
|
| 71 |
|
| 72 |
Each episode provides a synthetic PyTorch repo with:
|
|
|
|
| 56 |
| Task | Difficulty | Description |
|
| 57 |
|------|-----------|-------------|
|
| 58 |
| `easy` | ⭐ | Single-file bug — missing `zero_grad`, wrong loss |
|
| 59 |
+
| `medium` | ⭐⭐ | Multi-file root cause — data leakage, learning-rate misconfig |
|
| 60 |
| `hard` | ⭐⭐⭐ | Silent failure — memory leak, AMP overflow, red herrings |
|
| 61 |
|
| 62 |
+
Each difficulty draws from multiple bug templates, so repeated runs do not recycle the same exact failure.
|
| 63 |
+
|
| 64 |
## Reward Structure
|
| 65 |
|
| 66 |
- **Hypothesis delta** (60%) — reward for improving your bug hypothesis each step
|
|
|
|
| 69 |
|
| 70 |
Scores range from `0.0` to `1.0`. Partial credit for correct bug category on hard tasks.
|
| 71 |
|
| 72 |
+
## Investigation Actions
|
| 73 |
+
|
| 74 |
+
- `reveal_file`: reveal a file from the synthetic repo
|
| 75 |
+
- `extend_loss_curve`: reveal more loss-curve points
|
| 76 |
+
- `extend_gpu_profile`: reveal more GPU profile points
|
| 77 |
+
- `reveal_log_chunk`: append additional training log lines
|
| 78 |
+
- `run_diagnostic`: expose a diagnostic summary report
|
| 79 |
+
|
| 80 |
+
## Reproducibility
|
| 81 |
+
|
| 82 |
+
Use `SEED` to make scenario selection and artifacts deterministic across runs:
|
| 83 |
+
|
| 84 |
+
```bash
|
| 85 |
+
set SEED=42
|
| 86 |
+
python inference.py
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
## Baseline Scores
|
| 90 |
+
|
| 91 |
+
Run `inference.py` with a fixed `SEED` to record your baseline scores. The script prints per-task `[END]` lines with the final rewards.
|
| 92 |
+
|
| 93 |
+
Example template (fill after running):
|
| 94 |
+
|
| 95 |
+
| Model | Seed | Easy | Medium | Hard |
|
| 96 |
+
|------|------|------|--------|------|
|
| 97 |
+
| gpt-3.5-turbo | 42 | 0.xx | 0.xx | 0.xx |
|
| 98 |
+
|
| 99 |
## Environment State
|
| 100 |
|
| 101 |
Each episode provides a synthetic PyTorch repo with:
|
inference.py
CHANGED
|
@@ -15,6 +15,16 @@ TASKS = os.environ.get("TASKS", "easy,medium,hard")
|
|
| 15 |
MAX_STEPS = int(os.environ.get("MAX_STEPS", "5"))
|
| 16 |
SUCCESS_SCORE_THRESHOLD = float(os.environ.get("SUCCESS_SCORE_THRESHOLD", "0.7"))
|
| 17 |
MAX_TOTAL_REWARD = float(os.environ.get("MAX_TOTAL_REWARD", "1.0"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
def _sanitize_field(value: object) -> str:
|
|
@@ -57,7 +67,7 @@ You are debugging a PyTorch training job. Respond ONLY with valid JSON matching
|
|
| 57 |
}}
|
| 58 |
|
| 59 |
Valid action types: reveal_file, extend_loss_curve, extend_gpu_profile, reveal_log_chunk, run_diagnostic
|
| 60 |
-
Valid bug types: missing_zero_grad, data_leakage, memory_leak, learning_rate_too_high, gradient_explosion
|
| 61 |
|
| 62 |
Observation:
|
| 63 |
{json.dumps(observation)[:8000]}
|
|
@@ -76,12 +86,16 @@ async def _run_task(task: str, client: OpenAI) -> None:
|
|
| 76 |
rewards: List[float] = []
|
| 77 |
history: List[str] = []
|
| 78 |
steps_taken = 0
|
|
|
|
| 79 |
|
| 80 |
log_start(task=task, env="pytorch-debug-env", model=MODEL_NAME)
|
| 81 |
|
| 82 |
try:
|
| 83 |
async with httpx.AsyncClient(timeout=60.0) as session:
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
| 85 |
reset_resp.raise_for_status()
|
| 86 |
result = reset_resp.json()
|
| 87 |
|
|
|
|
| 15 |
MAX_STEPS = int(os.environ.get("MAX_STEPS", "5"))
|
| 16 |
SUCCESS_SCORE_THRESHOLD = float(os.environ.get("SUCCESS_SCORE_THRESHOLD", "0.7"))
|
| 17 |
MAX_TOTAL_REWARD = float(os.environ.get("MAX_TOTAL_REWARD", "1.0"))
|
| 18 |
+
SEED = os.environ.get("SEED")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _parse_seed(value: str | None) -> int | None:
|
| 22 |
+
if value is None:
|
| 23 |
+
return None
|
| 24 |
+
try:
|
| 25 |
+
return int(value)
|
| 26 |
+
except ValueError:
|
| 27 |
+
return None
|
| 28 |
|
| 29 |
|
| 30 |
def _sanitize_field(value: object) -> str:
|
|
|
|
| 67 |
}}
|
| 68 |
|
| 69 |
Valid action types: reveal_file, extend_loss_curve, extend_gpu_profile, reveal_log_chunk, run_diagnostic
|
| 70 |
+
Valid bug types: missing_zero_grad, data_leakage, memory_leak, learning_rate_too_high, gradient_explosion, wrong_loss_function, amp_overflow
|
| 71 |
|
| 72 |
Observation:
|
| 73 |
{json.dumps(observation)[:8000]}
|
|
|
|
| 86 |
rewards: List[float] = []
|
| 87 |
history: List[str] = []
|
| 88 |
steps_taken = 0
|
| 89 |
+
seed_value = _parse_seed(SEED)
|
| 90 |
|
| 91 |
log_start(task=task, env="pytorch-debug-env", model=MODEL_NAME)
|
| 92 |
|
| 93 |
try:
|
| 94 |
async with httpx.AsyncClient(timeout=60.0) as session:
|
| 95 |
+
reset_params = {"task_id": task}
|
| 96 |
+
if seed_value is not None:
|
| 97 |
+
reset_params["seed"] = seed_value
|
| 98 |
+
reset_resp = await session.post(f"{ENV_URL}/reset", params=reset_params)
|
| 99 |
reset_resp.raise_for_status()
|
| 100 |
result = reset_resp.json()
|
| 101 |
|
src/pytorch_debug_env/bug_library.py
CHANGED
|
@@ -43,15 +43,66 @@ def dummy_artifact_generator(artifact_type: str, rng):
|
|
| 43 |
{"step": int(i), "train_loss": float(base[i] + oscillation[i])}
|
| 44 |
for i in range(100)
|
| 45 |
]
|
| 46 |
-
|
| 47 |
t = np.arange(100)
|
| 48 |
allocated = 2048 + 2.4 * t
|
| 49 |
return [
|
| 50 |
{"step": int(i), "allocated_mb": float(allocated[i])}
|
| 51 |
for i in range(100)
|
| 52 |
]
|
| 53 |
-
|
| 54 |
return "Epoch 1, Step 0: loss 2.45\nEpoch 1, Step 1: loss 2.43\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
return []
|
| 56 |
|
| 57 |
def mutate_missing_zero_grad(repo_files, rng):
|
|
@@ -97,6 +148,52 @@ class ImageDataset(Dataset):
|
|
| 97 |
"""
|
| 98 |
return repo_files
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
BUG_TEMPLATES = [
|
| 101 |
BugTemplate(
|
| 102 |
bug_type="missing_zero_grad",
|
|
@@ -111,6 +208,19 @@ BUG_TEMPLATES = [
|
|
| 111 |
artifact_generator=dummy_artifact_generator,
|
| 112 |
repo_mutator=mutate_missing_zero_grad,
|
| 113 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
BugTemplate(
|
| 115 |
bug_type="data_leakage",
|
| 116 |
category="data",
|
|
@@ -124,6 +234,19 @@ BUG_TEMPLATES = [
|
|
| 124 |
artifact_generator=dummy_artifact_generator,
|
| 125 |
repo_mutator=mutate_data_leakage,
|
| 126 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
BugTemplate(
|
| 128 |
bug_type="memory_leak",
|
| 129 |
category="resource",
|
|
@@ -136,5 +259,18 @@ BUG_TEMPLATES = [
|
|
| 136 |
description="Memory leak",
|
| 137 |
artifact_generator=dummy_artifact_generator,
|
| 138 |
repo_mutator=mutate_memory_leak,
|
| 139 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
]
|
|
|
|
| 43 |
{"step": int(i), "train_loss": float(base[i] + oscillation[i])}
|
| 44 |
for i in range(100)
|
| 45 |
]
|
| 46 |
+
if artifact_type == "gpu_profile":
|
| 47 |
t = np.arange(100)
|
| 48 |
allocated = 2048 + 2.4 * t
|
| 49 |
return [
|
| 50 |
{"step": int(i), "allocated_mb": float(allocated[i])}
|
| 51 |
for i in range(100)
|
| 52 |
]
|
| 53 |
+
if artifact_type == "training_log":
|
| 54 |
return "Epoch 1, Step 0: loss 2.45\nEpoch 1, Step 1: loss 2.43\n"
|
| 55 |
+
if artifact_type == "diagnostic_report":
|
| 56 |
+
return "No critical diagnostics found. Review optimizer and data pipeline."
|
| 57 |
+
return []
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def artifact_generator_wrong_loss(artifact_type: str, rng):
|
| 61 |
+
if artifact_type == "loss_curve":
|
| 62 |
+
t = np.arange(100)
|
| 63 |
+
base = 1.8 + 0.05 * np.sin(0.15 * t)
|
| 64 |
+
return [{"step": int(i), "train_loss": float(base[i])} for i in range(100)]
|
| 65 |
+
if artifact_type == "gpu_profile":
|
| 66 |
+
t = np.arange(100)
|
| 67 |
+
allocated = 1900 + 1.8 * t
|
| 68 |
+
return [{"step": int(i), "allocated_mb": float(allocated[i])} for i in range(100)]
|
| 69 |
+
if artifact_type == "training_log":
|
| 70 |
+
return "Epoch 1: loss 1.82, acc 0.11\nEpoch 2: loss 1.80, acc 0.12\n"
|
| 71 |
+
if artifact_type == "diagnostic_report":
|
| 72 |
+
return "Loss plateaus early while accuracy stays near chance. Check loss function."
|
| 73 |
+
return []
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def artifact_generator_lr_high(artifact_type: str, rng):
|
| 77 |
+
if artifact_type == "loss_curve":
|
| 78 |
+
t = np.arange(100)
|
| 79 |
+
base = 0.9 + 0.02 * (t ** 1.1)
|
| 80 |
+
return [{"step": int(i), "train_loss": float(base[i])} for i in range(100)]
|
| 81 |
+
if artifact_type == "gpu_profile":
|
| 82 |
+
t = np.arange(100)
|
| 83 |
+
allocated = 2100 + 2.0 * t
|
| 84 |
+
return [{"step": int(i), "allocated_mb": float(allocated[i])} for i in range(100)]
|
| 85 |
+
if artifact_type == "training_log":
|
| 86 |
+
return "Step 10: loss 3.20 (spike)\nStep 20: loss 5.10 (diverged)\n"
|
| 87 |
+
if artifact_type == "diagnostic_report":
|
| 88 |
+
return "Loss spikes suggest unstable updates. Consider lowering learning rate."
|
| 89 |
+
return []
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def artifact_generator_amp_overflow(artifact_type: str, rng):
|
| 93 |
+
if artifact_type == "loss_curve":
|
| 94 |
+
t = np.arange(100)
|
| 95 |
+
base = 2.1 * np.exp(-0.008 * t) + 0.2
|
| 96 |
+
base[30:] = base[30:] + 0.6
|
| 97 |
+
return [{"step": int(i), "train_loss": float(base[i])} for i in range(100)]
|
| 98 |
+
if artifact_type == "gpu_profile":
|
| 99 |
+
t = np.arange(100)
|
| 100 |
+
allocated = 2300 + 3.2 * t
|
| 101 |
+
return [{"step": int(i), "allocated_mb": float(allocated[i])} for i in range(100)]
|
| 102 |
+
if artifact_type == "training_log":
|
| 103 |
+
return "AMP: overflow detected, skipping step\nAMP: scale reduced to 32768\n"
|
| 104 |
+
if artifact_type == "diagnostic_report":
|
| 105 |
+
return "AMP overflow warnings observed. Ensure GradScaler is used correctly."
|
| 106 |
return []
|
| 107 |
|
| 108 |
def mutate_missing_zero_grad(repo_files, rng):
|
|
|
|
| 148 |
"""
|
| 149 |
return repo_files
|
| 150 |
|
| 151 |
+
|
| 152 |
+
def mutate_wrong_loss_function(repo_files, rng):
|
| 153 |
+
repo_files["train.py"] = """import torch
|
| 154 |
+
from model.architecture import Net
|
| 155 |
+
|
| 156 |
+
model = Net()
|
| 157 |
+
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
|
| 158 |
+
criterion = torch.nn.MSELoss() # BUG: wrong loss for classification
|
| 159 |
+
|
| 160 |
+
for epoch in range(10):
|
| 161 |
+
for x, y in dataloader:
|
| 162 |
+
optimizer.zero_grad()
|
| 163 |
+
output = model(x)
|
| 164 |
+
loss = criterion(output, y)
|
| 165 |
+
loss.backward()
|
| 166 |
+
optimizer.step()
|
| 167 |
+
"""
|
| 168 |
+
return repo_files
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def mutate_learning_rate_too_high(repo_files, rng):
|
| 172 |
+
repo_files["config/training_config.yaml"] = """lr: 1.0
|
| 173 |
+
batch_size: 32
|
| 174 |
+
"""
|
| 175 |
+
return repo_files
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def mutate_amp_overflow(repo_files, rng):
|
| 179 |
+
repo_files["train.py"] = """import torch
|
| 180 |
+
from model.architecture import Net
|
| 181 |
+
|
| 182 |
+
model = Net().cuda()
|
| 183 |
+
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
|
| 184 |
+
|
| 185 |
+
for epoch in range(10):
|
| 186 |
+
for x, y in dataloader:
|
| 187 |
+
optimizer.zero_grad()
|
| 188 |
+
with torch.cuda.amp.autocast():
|
| 189 |
+
output = model(x.cuda())
|
| 190 |
+
loss = torch.nn.functional.cross_entropy(output, y.cuda())
|
| 191 |
+
# BUG: missing GradScaler handling can cause overflows
|
| 192 |
+
loss.backward()
|
| 193 |
+
optimizer.step()
|
| 194 |
+
"""
|
| 195 |
+
return repo_files
|
| 196 |
+
|
| 197 |
BUG_TEMPLATES = [
|
| 198 |
BugTemplate(
|
| 199 |
bug_type="missing_zero_grad",
|
|
|
|
| 208 |
artifact_generator=dummy_artifact_generator,
|
| 209 |
repo_mutator=mutate_missing_zero_grad,
|
| 210 |
),
|
| 211 |
+
BugTemplate(
|
| 212 |
+
bug_type="wrong_loss_function",
|
| 213 |
+
category="optimization",
|
| 214 |
+
difficulty="easy",
|
| 215 |
+
primary_bug_file="train.py",
|
| 216 |
+
related_files=["config/training_config.yaml"],
|
| 217 |
+
red_herring_file="data/dataset.py",
|
| 218 |
+
fix_strategy="Use CrossEntropyLoss for classification logits",
|
| 219 |
+
line_range=[6, 12],
|
| 220 |
+
description="Wrong loss function",
|
| 221 |
+
artifact_generator=artifact_generator_wrong_loss,
|
| 222 |
+
repo_mutator=mutate_wrong_loss_function,
|
| 223 |
+
),
|
| 224 |
BugTemplate(
|
| 225 |
bug_type="data_leakage",
|
| 226 |
category="data",
|
|
|
|
| 234 |
artifact_generator=dummy_artifact_generator,
|
| 235 |
repo_mutator=mutate_data_leakage,
|
| 236 |
),
|
| 237 |
+
BugTemplate(
|
| 238 |
+
bug_type="learning_rate_too_high",
|
| 239 |
+
category="optimization",
|
| 240 |
+
difficulty="medium",
|
| 241 |
+
primary_bug_file="config/training_config.yaml",
|
| 242 |
+
related_files=["train.py"],
|
| 243 |
+
red_herring_file="model/attention.py",
|
| 244 |
+
fix_strategy="Reduce learning rate or use a scheduler",
|
| 245 |
+
line_range=[1, 1],
|
| 246 |
+
description="Learning rate too high",
|
| 247 |
+
artifact_generator=artifact_generator_lr_high,
|
| 248 |
+
repo_mutator=mutate_learning_rate_too_high,
|
| 249 |
+
),
|
| 250 |
BugTemplate(
|
| 251 |
bug_type="memory_leak",
|
| 252 |
category="resource",
|
|
|
|
| 259 |
description="Memory leak",
|
| 260 |
artifact_generator=dummy_artifact_generator,
|
| 261 |
repo_mutator=mutate_memory_leak,
|
| 262 |
+
),
|
| 263 |
+
BugTemplate(
|
| 264 |
+
bug_type="amp_overflow",
|
| 265 |
+
category="numerics",
|
| 266 |
+
difficulty="hard",
|
| 267 |
+
primary_bug_file="train.py",
|
| 268 |
+
related_files=["config/training_config.yaml"],
|
| 269 |
+
red_herring_file="model/architecture.py",
|
| 270 |
+
fix_strategy="Use GradScaler and scale updates for AMP",
|
| 271 |
+
line_range=[7, 13],
|
| 272 |
+
description="AMP overflow",
|
| 273 |
+
artifact_generator=artifact_generator_amp_overflow,
|
| 274 |
+
repo_mutator=mutate_amp_overflow,
|
| 275 |
+
),
|
| 276 |
]
|
src/pytorch_debug_env/environment.py
CHANGED
|
@@ -10,11 +10,14 @@ from .models import (
|
|
| 10 |
PyTorchDebugObservation,
|
| 11 |
PyTorchDebugState,
|
| 12 |
)
|
| 13 |
-
from .reward import compute_step_reward
|
| 14 |
from .scenario_generator import ScenarioGenerator
|
| 15 |
from .graders import grade_easy, grade_medium, grade_hard
|
| 16 |
|
| 17 |
GRADER_MAP = {"easy": grade_easy, "medium": grade_medium, "hard": grade_hard}
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
@dataclass
|
|
@@ -24,29 +27,40 @@ class RuntimeState:
|
|
| 24 |
current_step: int = 0
|
| 25 |
revealed_files: List[str] = field(default_factory=list)
|
| 26 |
hypothesis_history: List[HypothesisRecord] = field(default_factory=list)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
done: bool = False
|
| 28 |
final_score: float = 0.0
|
| 29 |
|
| 30 |
|
| 31 |
class PyTorchDebugEnv:
|
| 32 |
def __init__(self, generator: ScenarioGenerator, max_steps: int = 5):
|
|
|
|
| 33 |
self.generator = generator
|
| 34 |
self.runtime = RuntimeState(max_steps=max_steps)
|
| 35 |
|
| 36 |
-
async def reset(self, task_id: str = "easy"):
|
| 37 |
-
|
|
|
|
| 38 |
self.runtime = RuntimeState(
|
| 39 |
scenario=scenario,
|
| 40 |
max_steps=5 if task_id == "easy" else 6,
|
| 41 |
current_step=0,
|
| 42 |
revealed_files=["train.py", "config/training_config.yaml"],
|
| 43 |
hypothesis_history=[],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
done=False,
|
| 45 |
final_score=0.0,
|
| 46 |
)
|
| 47 |
return self._build_observation(last_feedback="Episode reset.")
|
| 48 |
|
| 49 |
async def step(self, action: PyTorchDebugAction):
|
|
|
|
| 50 |
if self.runtime.scenario is None:
|
| 51 |
raise RuntimeError("Call /reset before /step")
|
| 52 |
|
|
@@ -58,10 +72,23 @@ class PyTorchDebugEnv:
|
|
| 58 |
previous_quality = self.runtime.hypothesis_history[-1].quality if self.runtime.hypothesis_history else 0.0
|
| 59 |
|
| 60 |
investigation_target = None
|
| 61 |
-
if action.investigation_action
|
| 62 |
-
|
| 63 |
-
if
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
committed = action.final_diagnosis.model_dump() if action.commit_diagnosis and action.final_diagnosis else None
|
| 67 |
reward, components = compute_step_reward(
|
|
@@ -73,6 +100,7 @@ class PyTorchDebugEnv:
|
|
| 73 |
step_num=self.runtime.current_step,
|
| 74 |
max_steps=self.runtime.max_steps,
|
| 75 |
)
|
|
|
|
| 76 |
|
| 77 |
if committed:
|
| 78 |
grader = GRADER_MAP.get(scenario.task_id, grade_easy)
|
|
@@ -89,7 +117,7 @@ class PyTorchDebugEnv:
|
|
| 89 |
conf_bonus = components["confirmation_bonus"]
|
| 90 |
|
| 91 |
total = 0.60 * delta + 0.20 * inv_reward + 0.20 * diagnosis_reward + conf_bonus
|
| 92 |
-
reward = round(min(max(total, 0.0), 1.0), 4)
|
| 93 |
|
| 94 |
self.runtime.hypothesis_history.append(
|
| 95 |
HypothesisRecord(
|
|
@@ -114,6 +142,7 @@ class PyTorchDebugEnv:
|
|
| 114 |
}
|
| 115 |
|
| 116 |
async def state(self):
|
|
|
|
| 117 |
scenario = self.runtime.scenario
|
| 118 |
if not scenario:
|
| 119 |
return None
|
|
@@ -126,6 +155,7 @@ class PyTorchDebugEnv:
|
|
| 126 |
remaining_files=[
|
| 127 |
f for f in scenario.repo_files.keys() if f not in self.runtime.revealed_files
|
| 128 |
],
|
|
|
|
| 129 |
done=self.runtime.done,
|
| 130 |
final_score=self.runtime.final_score,
|
| 131 |
)
|
|
@@ -135,10 +165,18 @@ class PyTorchDebugEnv:
|
|
| 135 |
revealed = {k: v for k, v in scenario.repo_files.items() if k in self.runtime.revealed_files}
|
| 136 |
available = [k for k in scenario.repo_files.keys() if k not in self.runtime.revealed_files]
|
| 137 |
|
| 138 |
-
loss_window_size = min(
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
log_lines = scenario.training_log.splitlines()
|
| 141 |
-
|
|
|
|
|
|
|
| 142 |
|
| 143 |
return PyTorchDebugObservation(
|
| 144 |
scenario_id=scenario.scenario_id,
|
|
@@ -148,6 +186,7 @@ class PyTorchDebugEnv:
|
|
| 148 |
loss_curve_window=scenario.loss_curve[:loss_window_size],
|
| 149 |
gpu_profile_window=scenario.gpu_profile[:gpu_window_size],
|
| 150 |
training_log_tail=visible_log,
|
|
|
|
| 151 |
step_num=self.runtime.current_step,
|
| 152 |
steps_remaining=max(0, self.runtime.max_steps - self.runtime.current_step),
|
| 153 |
investigation_budget=max(0, self.runtime.max_steps - self.runtime.current_step),
|
|
|
|
| 10 |
PyTorchDebugObservation,
|
| 11 |
PyTorchDebugState,
|
| 12 |
)
|
| 13 |
+
from .reward import clamp_score, compute_step_reward
|
| 14 |
from .scenario_generator import ScenarioGenerator
|
| 15 |
from .graders import grade_easy, grade_medium, grade_hard
|
| 16 |
|
| 17 |
GRADER_MAP = {"easy": grade_easy, "medium": grade_medium, "hard": grade_hard}
|
| 18 |
+
LOSS_WINDOW_STEP = 25
|
| 19 |
+
GPU_WINDOW_STEP = 25
|
| 20 |
+
LOG_WINDOW_STEP = 10
|
| 21 |
|
| 22 |
|
| 23 |
@dataclass
|
|
|
|
| 27 |
current_step: int = 0
|
| 28 |
revealed_files: List[str] = field(default_factory=list)
|
| 29 |
hypothesis_history: List[HypothesisRecord] = field(default_factory=list)
|
| 30 |
+
loss_curve_bonus: int = 0
|
| 31 |
+
gpu_profile_bonus: int = 0
|
| 32 |
+
log_tail_bonus: int = 0
|
| 33 |
+
diagnostic_revealed: bool = False
|
| 34 |
done: bool = False
|
| 35 |
final_score: float = 0.0
|
| 36 |
|
| 37 |
|
| 38 |
class PyTorchDebugEnv:
|
| 39 |
def __init__(self, generator: ScenarioGenerator, max_steps: int = 5):
|
| 40 |
+
"""Create a PyTorch debugging environment with a scenario generator."""
|
| 41 |
self.generator = generator
|
| 42 |
self.runtime = RuntimeState(max_steps=max_steps)
|
| 43 |
|
| 44 |
+
async def reset(self, task_id: str = "easy", seed: int | None = None):
|
| 45 |
+
"""Start a new episode and return the initial observation."""
|
| 46 |
+
scenario = self.generator.generate(task_id, seed=seed)
|
| 47 |
self.runtime = RuntimeState(
|
| 48 |
scenario=scenario,
|
| 49 |
max_steps=5 if task_id == "easy" else 6,
|
| 50 |
current_step=0,
|
| 51 |
revealed_files=["train.py", "config/training_config.yaml"],
|
| 52 |
hypothesis_history=[],
|
| 53 |
+
loss_curve_bonus=0,
|
| 54 |
+
gpu_profile_bonus=0,
|
| 55 |
+
log_tail_bonus=0,
|
| 56 |
+
diagnostic_revealed=False,
|
| 57 |
done=False,
|
| 58 |
final_score=0.0,
|
| 59 |
)
|
| 60 |
return self._build_observation(last_feedback="Episode reset.")
|
| 61 |
|
| 62 |
async def step(self, action: PyTorchDebugAction):
|
| 63 |
+
"""Advance the environment by one step using the provided action."""
|
| 64 |
if self.runtime.scenario is None:
|
| 65 |
raise RuntimeError("Call /reset before /step")
|
| 66 |
|
|
|
|
| 72 |
previous_quality = self.runtime.hypothesis_history[-1].quality if self.runtime.hypothesis_history else 0.0
|
| 73 |
|
| 74 |
investigation_target = None
|
| 75 |
+
if action.investigation_action:
|
| 76 |
+
action_type = action.investigation_action.action
|
| 77 |
+
if action_type == "reveal_file":
|
| 78 |
+
investigation_target = action.investigation_action.target
|
| 79 |
+
if (
|
| 80 |
+
investigation_target in scenario.repo_files
|
| 81 |
+
and investigation_target not in self.runtime.revealed_files
|
| 82 |
+
):
|
| 83 |
+
self.runtime.revealed_files.append(investigation_target)
|
| 84 |
+
elif action_type == "extend_loss_curve":
|
| 85 |
+
self.runtime.loss_curve_bonus += 1
|
| 86 |
+
elif action_type == "extend_gpu_profile":
|
| 87 |
+
self.runtime.gpu_profile_bonus += 1
|
| 88 |
+
elif action_type == "reveal_log_chunk":
|
| 89 |
+
self.runtime.log_tail_bonus += 1
|
| 90 |
+
elif action_type == "run_diagnostic":
|
| 91 |
+
self.runtime.diagnostic_revealed = True
|
| 92 |
|
| 93 |
committed = action.final_diagnosis.model_dump() if action.commit_diagnosis and action.final_diagnosis else None
|
| 94 |
reward, components = compute_step_reward(
|
|
|
|
| 100 |
step_num=self.runtime.current_step,
|
| 101 |
max_steps=self.runtime.max_steps,
|
| 102 |
)
|
| 103 |
+
reward = clamp_score(reward)
|
| 104 |
|
| 105 |
if committed:
|
| 106 |
grader = GRADER_MAP.get(scenario.task_id, grade_easy)
|
|
|
|
| 117 |
conf_bonus = components["confirmation_bonus"]
|
| 118 |
|
| 119 |
total = 0.60 * delta + 0.20 * inv_reward + 0.20 * diagnosis_reward + conf_bonus
|
| 120 |
+
reward = round(clamp_score(min(max(total, 0.0), 1.0)), 4)
|
| 121 |
|
| 122 |
self.runtime.hypothesis_history.append(
|
| 123 |
HypothesisRecord(
|
|
|
|
| 142 |
}
|
| 143 |
|
| 144 |
async def state(self):
|
| 145 |
+
"""Return the current episode state, or None if not started."""
|
| 146 |
scenario = self.runtime.scenario
|
| 147 |
if not scenario:
|
| 148 |
return None
|
|
|
|
| 155 |
remaining_files=[
|
| 156 |
f for f in scenario.repo_files.keys() if f not in self.runtime.revealed_files
|
| 157 |
],
|
| 158 |
+
diagnostic_revealed=self.runtime.diagnostic_revealed,
|
| 159 |
done=self.runtime.done,
|
| 160 |
final_score=self.runtime.final_score,
|
| 161 |
)
|
|
|
|
| 165 |
revealed = {k: v for k, v in scenario.repo_files.items() if k in self.runtime.revealed_files}
|
| 166 |
available = [k for k in scenario.repo_files.keys() if k not in self.runtime.revealed_files]
|
| 167 |
|
| 168 |
+
loss_window_size = min(
|
| 169 |
+
len(scenario.loss_curve),
|
| 170 |
+
LOSS_WINDOW_STEP * (self.runtime.current_step + 1 + self.runtime.loss_curve_bonus),
|
| 171 |
+
)
|
| 172 |
+
gpu_window_size = min(
|
| 173 |
+
len(scenario.gpu_profile),
|
| 174 |
+
GPU_WINDOW_STEP * (self.runtime.current_step + 1 + self.runtime.gpu_profile_bonus),
|
| 175 |
+
)
|
| 176 |
log_lines = scenario.training_log.splitlines()
|
| 177 |
+
log_window = LOG_WINDOW_STEP * (self.runtime.current_step + 1 + self.runtime.log_tail_bonus)
|
| 178 |
+
visible_log = "\n".join(log_lines[-min(len(log_lines), log_window):])
|
| 179 |
+
diagnostic_report = scenario.diagnostic_report if self.runtime.diagnostic_revealed else None
|
| 180 |
|
| 181 |
return PyTorchDebugObservation(
|
| 182 |
scenario_id=scenario.scenario_id,
|
|
|
|
| 186 |
loss_curve_window=scenario.loss_curve[:loss_window_size],
|
| 187 |
gpu_profile_window=scenario.gpu_profile[:gpu_window_size],
|
| 188 |
training_log_tail=visible_log,
|
| 189 |
+
diagnostic_report=diagnostic_report,
|
| 190 |
step_num=self.runtime.current_step,
|
| 191 |
steps_remaining=max(0, self.runtime.max_steps - self.runtime.current_step),
|
| 192 |
investigation_budget=max(0, self.runtime.max_steps - self.runtime.current_step),
|
src/pytorch_debug_env/graders.py
CHANGED
|
@@ -1,21 +1,24 @@
|
|
| 1 |
# src/pytorch_debug_env/graders.py
|
| 2 |
from __future__ import annotations
|
| 3 |
|
| 4 |
-
from .reward import final_diagnosis_score
|
| 5 |
|
| 6 |
|
| 7 |
def grade_easy(action: dict, gt: dict) -> float:
|
| 8 |
-
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
def grade_medium(action: dict, gt: dict) -> float:
|
|
|
|
| 12 |
score = final_diagnosis_score(action, gt)
|
| 13 |
if action.get("affected_file") in gt.get("related_files", []):
|
| 14 |
score = min(1.0, score + 0.05)
|
| 15 |
-
return round(score, 4)
|
| 16 |
|
| 17 |
|
| 18 |
def grade_hard(action: dict, gt: dict) -> float:
|
|
|
|
| 19 |
score = final_diagnosis_score(action, gt)
|
| 20 |
|
| 21 |
# partial credit if model gets the right category on subtle bugs
|
|
@@ -28,4 +31,4 @@ def grade_hard(action: dict, gt: dict) -> float:
|
|
| 28 |
if action.get("affected_file") == gt.get("red_herring_file"):
|
| 29 |
score = max(0.0, score - 0.1)
|
| 30 |
|
| 31 |
-
return round(min(score, 1.0), 4)
|
|
|
|
| 1 |
# src/pytorch_debug_env/graders.py
|
| 2 |
from __future__ import annotations
|
| 3 |
|
| 4 |
+
from .reward import clamp_score, final_diagnosis_score
|
| 5 |
|
| 6 |
|
| 7 |
def grade_easy(action: dict, gt: dict) -> float:
|
| 8 |
+
"""Easy grader: strict match on the core diagnosis fields."""
|
| 9 |
+
return clamp_score(final_diagnosis_score(action, gt))
|
| 10 |
|
| 11 |
|
| 12 |
def grade_medium(action: dict, gt: dict) -> float:
|
| 13 |
+
"""Medium grader: add small credit for related-file hypotheses."""
|
| 14 |
score = final_diagnosis_score(action, gt)
|
| 15 |
if action.get("affected_file") in gt.get("related_files", []):
|
| 16 |
score = min(1.0, score + 0.05)
|
| 17 |
+
return round(clamp_score(score), 4)
|
| 18 |
|
| 19 |
|
| 20 |
def grade_hard(action: dict, gt: dict) -> float:
|
| 21 |
+
"""Hard grader: allow category credit, penalize red herrings."""
|
| 22 |
score = final_diagnosis_score(action, gt)
|
| 23 |
|
| 24 |
# partial credit if model gets the right category on subtle bugs
|
|
|
|
| 31 |
if action.get("affected_file") == gt.get("red_herring_file"):
|
| 32 |
score = max(0.0, score - 0.1)
|
| 33 |
|
| 34 |
+
return round(clamp_score(min(score, 1.0)), 4)
|
src/pytorch_debug_env/models.py
CHANGED
|
@@ -51,6 +51,7 @@ class PyTorchDebugObservation(BaseModel):
|
|
| 51 |
loss_curve_window: List[Dict]
|
| 52 |
gpu_profile_window: List[Dict]
|
| 53 |
training_log_tail: str
|
|
|
|
| 54 |
step_num: int
|
| 55 |
steps_remaining: int
|
| 56 |
investigation_budget: int
|
|
@@ -65,6 +66,7 @@ class PyTorchDebugState(BaseModel):
|
|
| 65 |
current_step: int
|
| 66 |
revealed_files: List[str]
|
| 67 |
remaining_files: List[str]
|
|
|
|
| 68 |
done: bool
|
| 69 |
final_score: float = 0.0
|
| 70 |
|
|
|
|
| 51 |
loss_curve_window: List[Dict]
|
| 52 |
gpu_profile_window: List[Dict]
|
| 53 |
training_log_tail: str
|
| 54 |
+
diagnostic_report: Optional[str] = None
|
| 55 |
step_num: int
|
| 56 |
steps_remaining: int
|
| 57 |
investigation_budget: int
|
|
|
|
| 66 |
current_step: int
|
| 67 |
revealed_files: List[str]
|
| 68 |
remaining_files: List[str]
|
| 69 |
+
diagnostic_revealed: bool = False
|
| 70 |
done: bool
|
| 71 |
final_score: float = 0.0
|
| 72 |
|
src/pytorch_debug_env/reward.py
CHANGED
|
@@ -3,26 +3,35 @@ from __future__ import annotations
|
|
| 3 |
|
| 4 |
from .bug_library import BUG_CATEGORIES
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
def hypothesis_quality(hypothesis: dict, ground_truth: dict) -> float:
|
| 8 |
-
|
|
|
|
| 9 |
|
| 10 |
if hypothesis.get("affected_file") == ground_truth["primary_bug_file"]:
|
| 11 |
-
|
| 12 |
elif hypothesis.get("affected_file") in ground_truth.get("related_files", []):
|
| 13 |
-
|
| 14 |
|
| 15 |
if hypothesis.get("bug_type") == ground_truth["bug_type"]:
|
| 16 |
-
|
| 17 |
elif BUG_CATEGORIES.get(hypothesis.get("bug_type")) == BUG_CATEGORIES.get(ground_truth["bug_type"]):
|
| 18 |
-
|
| 19 |
|
| 20 |
-
calibration = 1.0 - abs(hypothesis.get("confidence", 0.5) - min(
|
| 21 |
-
|
| 22 |
-
return round(min(
|
| 23 |
|
| 24 |
|
| 25 |
def final_diagnosis_score(diagnosis: dict, ground_truth: dict) -> float:
|
|
|
|
| 26 |
score = 0.0
|
| 27 |
|
| 28 |
if diagnosis.get("bug_type") == ground_truth["bug_type"]:
|
|
@@ -38,10 +47,11 @@ def final_diagnosis_score(diagnosis: dict, ground_truth: dict) -> float:
|
|
| 38 |
if diagnosis.get("fix_strategy") == ground_truth["fix_strategy"]:
|
| 39 |
score += 0.15
|
| 40 |
|
| 41 |
-
return round(min(score, 1.0), 4)
|
| 42 |
|
| 43 |
|
| 44 |
def line_overlap(pred: list[int], actual: list[int]) -> float:
|
|
|
|
| 45 |
p1, p2 = pred
|
| 46 |
a1, a2 = actual
|
| 47 |
inter = max(0, min(p2, a2) - max(p1, a1) + 1)
|
|
@@ -58,6 +68,7 @@ def compute_step_reward(
|
|
| 58 |
step_num: int = 1,
|
| 59 |
max_steps: int = 5,
|
| 60 |
) -> tuple[float, dict]:
|
|
|
|
| 61 |
current_quality = hypothesis_quality(current_hypothesis, ground_truth)
|
| 62 |
delta = current_quality - previous_quality
|
| 63 |
|
|
@@ -81,7 +92,7 @@ def compute_step_reward(
|
|
| 81 |
diagnosis_reward += max(0.0, 0.08 * (max_steps - step_num))
|
| 82 |
|
| 83 |
total = 0.60 * delta + 0.20 * investigation_reward + 0.20 * diagnosis_reward + confirmation_bonus
|
| 84 |
-
total = round(min(max(total, 0.0), 1.0), 4)
|
| 85 |
|
| 86 |
return total, {
|
| 87 |
"hypothesis_quality": current_quality,
|
|
|
|
| 3 |
|
| 4 |
from .bug_library import BUG_CATEGORIES
|
| 5 |
|
| 6 |
+
EPSILON = 1e-3
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def clamp_score(value: float) -> float:
|
| 10 |
+
"""Clamp scores to the open interval (0, 1) for validator compliance."""
|
| 11 |
+
return min(max(value, EPSILON), 1.0 - EPSILON)
|
| 12 |
+
|
| 13 |
|
| 14 |
def hypothesis_quality(hypothesis: dict, ground_truth: dict) -> float:
    """Score how well the current hypothesis matches the ground truth.

    File match contributes up to 0.45 (0.15 for a related file), bug-type
    match up to 0.40 (0.13 for the same category), and confidence
    calibration up to 0.15. Result is capped at 1.0 and rounded to 4 places.
    """
    quality = 0.0

    predicted_file = hypothesis.get("affected_file")
    if predicted_file == ground_truth["primary_bug_file"]:
        quality += 0.45
    elif predicted_file in ground_truth.get("related_files", []):
        quality += 0.15

    predicted_type = hypothesis.get("bug_type")
    actual_type = ground_truth["bug_type"]
    if predicted_type == actual_type:
        quality += 0.40
    elif BUG_CATEGORIES.get(predicted_type) == BUG_CATEGORIES.get(actual_type):
        quality += 0.13

    # Reward confidence that tracks the evidence accumulated so far.
    confidence = hypothesis.get("confidence", 0.5)
    calibration = 1.0 - abs(confidence - min(quality, 1.0))
    quality += 0.15 * calibration
    return round(min(quality, 1.0), 4)
|
| 31 |
|
| 32 |
|
| 33 |
def final_diagnosis_score(diagnosis: dict, ground_truth: dict) -> float:
|
| 34 |
+
"""Score the committed diagnosis against the ground truth."""
|
| 35 |
score = 0.0
|
| 36 |
|
| 37 |
if diagnosis.get("bug_type") == ground_truth["bug_type"]:
|
|
|
|
| 47 |
if diagnosis.get("fix_strategy") == ground_truth["fix_strategy"]:
|
| 48 |
score += 0.15
|
| 49 |
|
| 50 |
+
return round(clamp_score(min(score, 1.0)), 4)
|
| 51 |
|
| 52 |
|
| 53 |
def line_overlap(pred: list[int], actual: list[int]) -> float:
|
| 54 |
+
"""Compute overlap ratio between two line ranges."""
|
| 55 |
p1, p2 = pred
|
| 56 |
a1, a2 = actual
|
| 57 |
inter = max(0, min(p2, a2) - max(p1, a1) + 1)
|
|
|
|
| 68 |
step_num: int = 1,
|
| 69 |
max_steps: int = 5,
|
| 70 |
) -> tuple[float, dict]:
|
| 71 |
+
"""Compute step-level reward and diagnostic components."""
|
| 72 |
current_quality = hypothesis_quality(current_hypothesis, ground_truth)
|
| 73 |
delta = current_quality - previous_quality
|
| 74 |
|
|
|
|
| 92 |
diagnosis_reward += max(0.0, 0.08 * (max_steps - step_num))
|
| 93 |
|
| 94 |
total = 0.60 * delta + 0.20 * investigation_reward + 0.20 * diagnosis_reward + confirmation_bonus
|
| 95 |
+
total = round(clamp_score(min(max(total, 0.0), 1.0)), 4)
|
| 96 |
|
| 97 |
return total, {
|
| 98 |
"hypothesis_quality": current_quality,
|
src/pytorch_debug_env/scenario_generator.py
CHANGED
|
@@ -19,14 +19,17 @@ class Scenario:
|
|
| 19 |
loss_curve: List[Dict]
|
| 20 |
gpu_profile: List[Dict]
|
| 21 |
training_log: str
|
|
|
|
| 22 |
ground_truth: Dict
|
| 23 |
|
| 24 |
|
| 25 |
class ScenarioGenerator:
|
| 26 |
def __init__(self, bug_templates: List[BugTemplate]):
|
|
|
|
| 27 |
self.bug_templates = bug_templates
|
| 28 |
|
| 29 |
def generate(self, difficulty: str, seed: int | None = None) -> Scenario:
|
|
|
|
| 30 |
rng = random.Random(seed)
|
| 31 |
candidates = [b for b in self.bug_templates if b.difficulty == difficulty]
|
| 32 |
if not candidates:
|
|
@@ -39,6 +42,7 @@ class ScenarioGenerator:
|
|
| 39 |
loss_curve = template.artifact_generator("loss_curve", rng)
|
| 40 |
gpu_profile = template.artifact_generator("gpu_profile", rng)
|
| 41 |
training_log = template.artifact_generator("training_log", rng)
|
|
|
|
| 42 |
|
| 43 |
ground_truth = {
|
| 44 |
"bug_type": template.bug_type,
|
|
@@ -57,6 +61,7 @@ class ScenarioGenerator:
|
|
| 57 |
loss_curve=loss_curve,
|
| 58 |
gpu_profile=gpu_profile,
|
| 59 |
training_log=training_log,
|
|
|
|
| 60 |
ground_truth=ground_truth,
|
| 61 |
)
|
| 62 |
|
|
|
|
| 19 |
loss_curve: List[Dict]
|
| 20 |
gpu_profile: List[Dict]
|
| 21 |
training_log: str
|
| 22 |
+
diagnostic_report: str
|
| 23 |
ground_truth: Dict
|
| 24 |
|
| 25 |
|
| 26 |
class ScenarioGenerator:
|
| 27 |
def __init__(self, bug_templates: List[BugTemplate]):
|
| 28 |
+
"""Create a generator that samples from a set of bug templates."""
|
| 29 |
self.bug_templates = bug_templates
|
| 30 |
|
| 31 |
def generate(self, difficulty: str, seed: int | None = None) -> Scenario:
|
| 32 |
+
"""Build a scenario with deterministic artifacts when a seed is provided."""
|
| 33 |
rng = random.Random(seed)
|
| 34 |
candidates = [b for b in self.bug_templates if b.difficulty == difficulty]
|
| 35 |
if not candidates:
|
|
|
|
| 42 |
loss_curve = template.artifact_generator("loss_curve", rng)
|
| 43 |
gpu_profile = template.artifact_generator("gpu_profile", rng)
|
| 44 |
training_log = template.artifact_generator("training_log", rng)
|
| 45 |
+
diagnostic_report = template.artifact_generator("diagnostic_report", rng)
|
| 46 |
|
| 47 |
ground_truth = {
|
| 48 |
"bug_type": template.bug_type,
|
|
|
|
| 61 |
loss_curve=loss_curve,
|
| 62 |
gpu_profile=gpu_profile,
|
| 63 |
training_log=training_log,
|
| 64 |
+
diagnostic_report=diagnostic_report,
|
| 65 |
ground_truth=ground_truth,
|
| 66 |
)
|
| 67 |
|
src/pytorch_debug_env/server.py
CHANGED
|
@@ -27,13 +27,13 @@ async def health():
|
|
| 27 |
|
| 28 |
|
| 29 |
@app.post("/reset")
|
| 30 |
-
async def reset(task_id: str = "easy"):
|
| 31 |
global latest_session_id
|
| 32 |
session_id = str(uuid4())
|
| 33 |
env = PyTorchDebugEnv(generator=ScenarioGenerator(BUG_TEMPLATES))
|
| 34 |
sessions[session_id] = env
|
| 35 |
latest_session_id = session_id
|
| 36 |
-
obs = await env.reset(task_id=task_id)
|
| 37 |
return {"session_id": session_id, "observation": obs, "done": False}
|
| 38 |
|
| 39 |
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
@app.post("/reset")
|
| 30 |
+
async def reset(task_id: str = "easy", seed: int | None = None):
|
| 31 |
global latest_session_id
|
| 32 |
session_id = str(uuid4())
|
| 33 |
env = PyTorchDebugEnv(generator=ScenarioGenerator(BUG_TEMPLATES))
|
| 34 |
sessions[session_id] = env
|
| 35 |
latest_session_id = session_id
|
| 36 |
+
obs = await env.reset(task_id=task_id, seed=seed)
|
| 37 |
return {"session_id": session_id, "observation": obs, "done": False}
|
| 38 |
|
| 39 |
|
tests/test_environment_edge_cases.py
CHANGED
|
@@ -87,7 +87,7 @@ async def test_reward_range_and_info_keys():
|
|
| 87 |
),
|
| 88 |
)
|
| 89 |
result = await env.step(action)
|
| 90 |
-
assert 0.0 <
|
| 91 |
for key in (
|
| 92 |
"hypothesis_quality",
|
| 93 |
"hypothesis_delta",
|
|
@@ -96,3 +96,69 @@ async def test_reward_range_and_info_keys():
|
|
| 96 |
"confirmation_bonus",
|
| 97 |
):
|
| 98 |
assert key in result["info"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
),
|
| 88 |
)
|
| 89 |
result = await env.step(action)
|
| 90 |
+
assert 0.0 < result["reward"] < 1.0
|
| 91 |
for key in (
|
| 92 |
"hypothesis_quality",
|
| 93 |
"hypothesis_delta",
|
|
|
|
| 96 |
"confirmation_bonus",
|
| 97 |
):
|
| 98 |
assert key in result["info"]
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
@pytest.mark.asyncio
async def test_extend_loss_curve_increases_window():
    """extend_loss_curve must reveal more loss points than a plain step."""
    # Control: same seed, no investigation action.
    control_env = make_env()
    await control_env.reset("easy", seed=123)
    control = await control_env.step(
        PyTorchDebugAction(current_hypothesis=base_hypothesis())
    )
    control_window = len(control["observation"].loss_curve_window)

    # Probe: identical setup plus the extend_loss_curve action.
    probe_env = make_env()
    await probe_env.reset("easy", seed=123)
    probe = await probe_env.step(
        PyTorchDebugAction(
            current_hypothesis=base_hypothesis(),
            investigation_action=InvestigationAction(action="extend_loss_curve"),
        )
    )
    assert len(probe["observation"].loss_curve_window) > control_window
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
@pytest.mark.asyncio
async def test_extend_gpu_profile_increases_window():
    """extend_gpu_profile must reveal more profile samples than a plain step."""
    # Control: same seed, no investigation action.
    control_env = make_env()
    await control_env.reset("easy", seed=321)
    control = await control_env.step(
        PyTorchDebugAction(current_hypothesis=base_hypothesis())
    )
    control_window = len(control["observation"].gpu_profile_window)

    # Probe: identical setup plus the extend_gpu_profile action.
    probe_env = make_env()
    await probe_env.reset("easy", seed=321)
    probe = await probe_env.step(
        PyTorchDebugAction(
            current_hypothesis=base_hypothesis(),
            investigation_action=InvestigationAction(action="extend_gpu_profile"),
        )
    )
    assert len(probe["observation"].gpu_profile_window) > control_window
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
@pytest.mark.asyncio
async def test_reveal_log_chunk_extends_tail():
    """reveal_log_chunk should expose at least as much of the training log."""
    env = make_env()
    await env.reset("easy", seed=77)
    action = PyTorchDebugAction(
        current_hypothesis=base_hypothesis(),
        investigation_action=InvestigationAction(action="reveal_log_chunk"),
    )
    extended = await env.step(action)
    extended_len = len(extended["observation"].training_log_tail)

    # Baseline: same seed, no investigation action.
    env_base = make_env()
    await env_base.reset("easy", seed=77)
    base = await env_base.step(PyTorchDebugAction(current_hypothesis=base_hypothesis()))
    base_len = len(base["observation"].training_log_tail)
    # NOTE(review): `>=` (unlike the strict `>` in the loss-curve and GPU-profile
    # tests) also passes when reveal_log_chunk reveals nothing new — presumably
    # the tail can already be fully revealed for some scenarios; confirm, or
    # tighten to `>`.
    assert extended_len >= base_len
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
@pytest.mark.asyncio
async def test_run_diagnostic_exposes_report():
    """run_diagnostic must surface a non-empty diagnostic report."""
    env = make_env()
    await env.reset("easy", seed=11)
    diagnostic_action = PyTorchDebugAction(
        current_hypothesis=base_hypothesis(),
        investigation_action=InvestigationAction(action="run_diagnostic"),
    )
    outcome = await env.step(diagnostic_action)
    assert outcome["observation"].diagnostic_report
|
tests/test_graders.py
CHANGED
|
@@ -16,7 +16,9 @@ def test_grade_easy():
|
|
| 16 |
"fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
|
| 17 |
"confidence": 0.8
|
| 18 |
}
|
| 19 |
-
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
def test_grade_medium_related_file_bonus():
|
|
@@ -34,7 +36,9 @@ def test_grade_medium_related_file_bonus():
|
|
| 34 |
"fix_strategy": "Ensure validation split is strictly separate from training",
|
| 35 |
"confidence": 0.6,
|
| 36 |
}
|
| 37 |
-
|
|
|
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
def test_grade_hard_category_partial_credit():
|
|
@@ -54,7 +58,9 @@ def test_grade_hard_category_partial_credit():
|
|
| 54 |
"fix_strategy": "Use CrossEntropyLoss instead of MSE",
|
| 55 |
"confidence": 0.5,
|
| 56 |
}
|
| 57 |
-
|
|
|
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
def test_grade_hard_penalizes_red_herring():
|
|
@@ -76,3 +82,4 @@ def test_grade_hard_penalizes_red_herring():
|
|
| 76 |
}
|
| 77 |
penalized = grade_hard(action, gt)
|
| 78 |
assert penalized <= 0.9
|
|
|
|
|
|
| 16 |
"fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
|
| 17 |
"confidence": 0.8
|
| 18 |
}
|
| 19 |
+
score = grade_easy(action, gt)
|
| 20 |
+
assert score > 0.8
|
| 21 |
+
assert score < 1.0
|
| 22 |
|
| 23 |
|
| 24 |
def test_grade_medium_related_file_bonus():
|
|
|
|
| 36 |
"fix_strategy": "Ensure validation split is strictly separate from training",
|
| 37 |
"confidence": 0.6,
|
| 38 |
}
|
| 39 |
+
score = grade_medium(action, gt)
|
| 40 |
+
assert score >= grade_easy(action, gt)
|
| 41 |
+
assert 0.0 < score < 1.0
|
| 42 |
|
| 43 |
|
| 44 |
def test_grade_hard_category_partial_credit():
|
|
|
|
| 58 |
"fix_strategy": "Use CrossEntropyLoss instead of MSE",
|
| 59 |
"confidence": 0.5,
|
| 60 |
}
|
| 61 |
+
score = grade_hard(action, gt)
|
| 62 |
+
assert score >= 0.18
|
| 63 |
+
assert 0.0 < score < 1.0
|
| 64 |
|
| 65 |
|
| 66 |
def test_grade_hard_penalizes_red_herring():
|
|
|
|
| 82 |
}
|
| 83 |
penalized = grade_hard(action, gt)
|
| 84 |
assert penalized <= 0.9
|
| 85 |
+
assert 0.0 < penalized < 1.0
|
tests/test_reward.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
# tests/test_reward.py
|
| 2 |
from src.pytorch_debug_env.reward import (
|
|
|
|
| 3 |
compute_step_reward,
|
| 4 |
final_diagnosis_score,
|
| 5 |
hypothesis_quality,
|
|
@@ -39,7 +40,8 @@ def test_final_diagnosis_score_bounds():
|
|
| 39 |
"line_range": [10, 12],
|
| 40 |
"fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
|
| 41 |
}
|
| 42 |
-
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
def test_compute_step_reward_clamps_non_negative():
|
|
@@ -65,5 +67,10 @@ def test_compute_step_reward_clamps_non_negative():
|
|
| 65 |
step_num=1,
|
| 66 |
max_steps=5,
|
| 67 |
)
|
| 68 |
-
assert reward
|
| 69 |
assert components["investigation_reward"] <= 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# tests/test_reward.py
|
| 2 |
from src.pytorch_debug_env.reward import (
|
| 3 |
+
clamp_score,
|
| 4 |
compute_step_reward,
|
| 5 |
final_diagnosis_score,
|
| 6 |
hypothesis_quality,
|
|
|
|
| 40 |
"line_range": [10, 12],
|
| 41 |
"fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
|
| 42 |
}
|
| 43 |
+
score = final_diagnosis_score(action, gt)
|
| 44 |
+
assert 0.0 < score < 1.0
|
| 45 |
|
| 46 |
|
| 47 |
def test_compute_step_reward_clamps_non_negative():
|
|
|
|
| 67 |
step_num=1,
|
| 68 |
max_steps=5,
|
| 69 |
)
|
| 70 |
+
assert 0.0 < reward < 1.0
|
| 71 |
assert components["investigation_reward"] <= 0.0
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def test_clamp_score_open_interval():
    """clamp_score must map any input into the open interval (0, 1)."""
    # Boundary inputs are pulled strictly inside the interval.
    assert 0.0 < clamp_score(0.0) < 1.0
    assert 0.0 < clamp_score(1.0) < 1.0
    # Out-of-range inputs are clamped, not passed through.
    assert 0.0 < clamp_score(-1.0) < 1.0
    assert 0.0 < clamp_score(2.0) < 1.0
    # In-range values are left untouched.
    assert clamp_score(0.5) == 0.5
|
tests/test_scenario_generator.py
CHANGED
|
@@ -8,3 +8,12 @@ def test_generate_invalid_difficulty_raises():
|
|
| 8 |
generator = ScenarioGenerator(BUG_TEMPLATES)
|
| 9 |
with pytest.raises(ValueError):
|
| 10 |
generator.generate("unknown")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
generator = ScenarioGenerator(BUG_TEMPLATES)
|
| 9 |
with pytest.raises(ValueError):
|
| 10 |
generator.generate("unknown")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def test_generate_seed_reproducibility():
    """Two generations with the same difficulty and seed must be identical."""
    gen = ScenarioGenerator(BUG_TEMPLATES)
    scenario_a = gen.generate("easy", seed=123)
    scenario_b = gen.generate("easy", seed=123)
    for field_name in ("ground_truth", "repo_files", "training_log"):
        assert getattr(scenario_a, field_name) == getattr(scenario_b, field_name)
|