Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- core/__init__.py +3 -2
- core/grading/grader.py +143 -0
- core/pipeline/stage_runner.py +396 -0
- core/scenarios/generator.py +433 -0
- core/utils/packages.py +23 -0
- core/validation/parser.py +64 -0
- core/validation/validator.py +43 -0
- models.py +1 -1
- openenv.yaml +3 -3
- openenv_CI_CD_Doctor.egg-info/SOURCES.txt +12 -8
- server/environment.py +4 -4
core/__init__.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
"""CI/CD Doctor environment subpackage."""
|
| 2 |
|
| 3 |
-
from .grader import grade
|
|
|
|
| 4 |
|
| 5 |
-
__all__ = [ "grade"]
|
|
|
|
| 1 |
"""CI/CD Doctor environment subpackage."""
|
| 2 |
|
| 3 |
+
from .grading.grader import grade
|
| 4 |
+
from .utils.packages import get_packages
|
| 5 |
|
| 6 |
+
__all__ = [ "grade","get_packages"]
|
core/grading/grader.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grader for the CI/CD Doctor environment.
|
| 3 |
+
|
| 4 |
+
Reward shape:
|
| 5 |
+
fixes_applied_fraction * 0.35 proportional credit for each answer_key fix
|
| 6 |
+
that is present in the filesystem (emitted
|
| 7 |
+
incrementally as each fix lands, not all-or-
|
| 8 |
+
nothing — on a 2-fix task, each fix is worth
|
| 9 |
+
+0.175)
|
| 10 |
+
pipeline_passed +0.50 pipeline_status == "passed" (terminal)
|
| 11 |
+
|
| 12 |
+
Total positive: 0.85 from grade() + shaped bonuses from balance_score().
|
| 13 |
+
|
| 14 |
+
Investigation milestones (investigated, logs_read, correct_file_located) are
|
| 15 |
+
still tracked in state.milestones for the balance_score() logic but give no
|
| 16 |
+
reward — reading a file is not progress, fixing it is.
|
| 17 |
+
|
| 18 |
+
balance_score() applies per-step shaped adjustments on top of the tier delta:
|
| 19 |
+
+0.05 First read of each answer-key file (exploration bonus, max 2 files)
|
| 20 |
+
-0.05 cat on a file already read this episode (redundant read penalty)
|
| 21 |
+
-0.10 pipeline run with no filesystem changes since last run (idle run)
|
| 22 |
+
-0.01 * overage each step taken beyond the task's ideal step count
|
| 23 |
+
(efficiency penalty scales linearly with how far past ideal — at
|
| 24 |
+
ideal+1 it's -0.01, at ideal+5 it's -0.05; cumulative cost on a
|
| 25 |
+
9-step overage tops out around -0.45)
|
| 26 |
+
-0.08 agent has read the correct file but runs pipeline again with no edit
|
| 27 |
+
(exploitation trap — knows the problem, not acting on it)
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
from dataclasses import dataclass, field
|
| 31 |
+
|
| 32 |
+
from models import PipelineState
|
| 33 |
+
|
| 34 |
+
CORRECT_FILE_EDITED_TOTAL = 0.2
|
| 35 |
+
|
| 36 |
+
TIER_REWARDS: dict[str, float] = {
|
| 37 |
+
"investigated": 0.0,
|
| 38 |
+
"logs_read": 0.0,
|
| 39 |
+
"correct_file_located": 0.01,
|
| 40 |
+
"pipeline_passed": 0.50,
|
| 41 |
+
"optimal_step":0.05
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
PENALTIES: dict[str, float] = {
|
| 45 |
+
"idle_pipeline_run": -0.10,
|
| 46 |
+
"redundant_read": -0.05,
|
| 47 |
+
"over_ideal_step": -0.01,
|
| 48 |
+
"exploitation_trap": -0.08,
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
@dataclass
class StepContext:
    """Per-step context handed to balance_score() alongside the pipeline state."""

    # Command type for this step, e.g. "cat" or "pipeline_run".
    cmd_type: str
    # Target file of a "cat" command, if any.
    filename: str | None = None
    # Files already read earlier in this episode (redundant-read check).
    files_read: set[str] = field(default_factory=set)
    # False when nothing was edited since the last pipeline run (idle-run check).
    fs_changed_since_last_run: bool = True
    # Steps taken so far this episode.
    step_count: int = 0
    # Hard cap on episode length.
    max_steps: int = 15
    # Ideal step budget for the task; overage is penalized linearly.
    ideal_steps: int = 6
    # Pipeline runs since the last edit (exploitation-trap check).
    pipeline_runs_since_last_edit: int = 0
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _fixes_applied_fraction(state: PipelineState) -> float:
    """
    Return the share (in [0.0, 1.0]) of answer-key fixes present on disk.

    Each fix earns credit the moment its fragment appears in the target
    file, so a multi-fix task rewards every correct edit as it happens
    rather than only when all of them are done.
    """
    expected = state.answer_key.get("fixes", {})
    if not expected:
        return 0.0

    landed = 0
    for target, fragment in expected.items():
        if fragment in state.filesystem.get(target, ""):
            landed += 1
    return landed / len(expected)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def grade(state: PipelineState) -> float:
    """
    Compute the total earned grade from state.

    Fractional credit for answer-key fixes present in the filesystem, plus
    milestone rewards and the terminal bonus once the pipeline passes.
    Investigation milestones contribute 0 — reading a file is not progress,
    fixing it is.
    """
    earned = set(state.milestones)
    if state.pipeline_status == "passed":
        earned.add("pipeline_passed")

    total = CORRECT_FILE_EDITED_TOTAL * _fixes_applied_fraction(state)
    for tier in earned:
        total += TIER_REWARDS.get(tier, 0.0)

    return round(total, 2)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def balance_score(state: PipelineState, ctx: StepContext) -> float:
    """
    Per-step shaped reward adjustment on top of the raw grade delta.

    Returns a float (may be negative); the caller adds it to the grade
    delta to produce the final step reward.

    Two goals:
      - Encourage exploration: small bonus the first time the agent reads
        a file that needs fixing (up to 2 files per episode).
      - Discourage waste: penalties for re-reading, idle pipeline runs,
        burning the step budget, and knowing the fix but not applying it.
    """
    delta = 0.0
    answer_files = set(state.answer_key.get("fixes", {}).keys())

    if ctx.cmd_type == "cat" and ctx.filename:
        already_seen = ctx.filename in ctx.files_read
        if not already_seen and ctx.filename in answer_files:
            # First read of a file that needs fixing — exploration bonus,
            # capped at 2 distinct answer-key files per episode.
            explored_so_far = len(answer_files & ctx.files_read)
            if explored_so_far < 2:
                delta += 0.05
        elif already_seen:
            # Re-reading a file is a wasted step.
            delta += PENALTIES["redundant_read"]

    if ctx.cmd_type == "pipeline_run":
        if not ctx.fs_changed_since_last_run:
            # No edits since the last run — the run reveals no new info.
            delta += PENALTIES["idle_pipeline_run"]

        if (
            "correct_file_located" in state.milestones
            and ctx.pipeline_runs_since_last_edit >= 1
        ):
            # The agent has already located the right file and re-run the
            # pipeline at least once since its last edit — it knows what
            # to fix but is stalling instead of applying the fix.
            delta += PENALTIES["exploitation_trap"]

    if ctx.step_count > ctx.ideal_steps:
        # Linear efficiency penalty for every step past the ideal budget.
        delta += PENALTIES["over_ideal_step"] * (ctx.step_count - ctx.ideal_steps)
    else:
        # Still at/under budget — small per-step efficiency bonus.
        delta += TIER_REWARDS["optimal_step"]

    return round(delta, 2)
|
core/pipeline/stage_runner.py
ADDED
|
@@ -0,0 +1,396 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Simulated pipeline stage runner for the CI/CD Doctor environment.
|
| 3 |
+
No real pip or subprocess — pure deterministic logic.
|
| 4 |
+
|
| 5 |
+
Error message design principle: report WHICH stage failed and WHICH file to
|
| 6 |
+
inspect, but NOT the exact value that is wrong. The agent must read the file
|
| 7 |
+
and reason about the fix itself.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import re
|
| 11 |
+
|
| 12 |
+
from core.validation.validator import validate_ci_stages, parse_stages
|
| 13 |
+
|
| 14 |
+
from core.utils.packages import get_packages_set
|
| 15 |
+
|
| 16 |
+
CORRECT_PYTHON_VERSION = "3.11"
|
| 17 |
+
REQUIRED_ENV_VARS = {"DATABASE_URL", "API_KEY", "SECRET_KEY"}
|
| 18 |
+
|
| 19 |
+
def run_install_stage(filesystem: dict, task: str = "easy") -> dict:
    """
    Simulate the pip-install stage against requirements.txt.

    Fails if any required package (per get_packages_set(task)) is missing,
    or — on the hard task — if numpy is pinned to the incompatible 1.21.

    Version pins are stripped before comparing against the required set, so
    `numpy==1.21` (and `>=`, `<`, `!=`, `~=` pins) parse as `numpy` for the
    missing-package check. Blank lines and '#' comment lines are ignored.

    Returns {"exit_code": int, "logs": str}.
    """
    required = get_packages_set(task)
    content = filesystem.get("requirements.txt", "")

    installed = set()
    for line in content.splitlines():
        entry = line.strip()
        if not entry or entry.startswith("#"):
            # Blank and comment lines are not requirements.
            continue
        # Strip any version-specifier suffix. `~` is included so `~=` pins
        # are stripped too; maxsplit is keyword-only (positional maxsplit
        # for re.split is deprecated as of Python 3.13).
        name = re.split(r"[=<>!~]", entry, maxsplit=1)[0].strip().lower()
        if name:
            installed.add(name)

    missing = required - installed

    if missing:
        # Deterministic: always report the alphabetically-first missing package.
        pkg = sorted(missing)[0]
        logs = "Collecting dependencies...\n"
        logs += f" Collecting {pkg}\n"
        logs += f"ERROR: Could not find a version that satisfies the requirement {pkg}\n"
        logs += f"ERROR: No matching distribution found for {pkg}\n"
        logs += " Add the missing package to requirements.txt.\n"
        return {"exit_code": 1, "logs": logs}

    if task == "hard" and "numpy==1.21" in content:
        logs = "Collecting dependencies...\n"
        logs += " Collecting numpy==1.21\n"
        logs += "ERROR: ResolutionImpossible: dependency conflict detected.\n"
        logs += " requirements.txt pins numpy==1.21, but a transitive dependency\n"
        logs += " requires numpy>=1.26. Update the numpy pin in requirements.txt.\n"
        return {"exit_code": 1, "logs": logs}

    logs = "Collecting dependencies...\n"
    logs += "Successfully installed " + " ".join(sorted(required)) + "\n"
    return {"exit_code": 0, "logs": logs}
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def run_env_check_stage(filesystem: dict) -> dict:
    """
    Verify that .env.ci defines every variable in REQUIRED_ENV_VARS.

    The error lists the required variable names (but not which one is
    missing) so the agent can diff them against the file itself.
    """
    raw = filesystem.get(".env.ci", "")
    defined = set()
    for line in raw.splitlines():
        if "=" in line and line.strip():
            defined.add(line.split("=")[0].strip())

    if REQUIRED_ENV_VARS - defined:
        logs = "Environment check failed.\n"
        logs += f" Required variables: {', '.join(sorted(REQUIRED_ENV_VARS))}\n"
        logs += " Not all required variables are defined in .env.ci.\n"
        return {"exit_code": 1, "logs": logs}

    logs = "Environment check passed.\n"
    logs += "All required variables present: " + ", ".join(sorted(REQUIRED_ENV_VARS)) + "\n"
    return {"exit_code": 0, "logs": logs}
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def run_docker_build_stage(filesystem: dict, strict_tag: bool = False) -> dict:
    """
    Verify the Dockerfile's Python base image.

    The default (medium) mode accepts any `python:3.11` tag. With
    strict_tag=True (hard task), the full tag must be `python:3.11-slim` —
    the alpine variant is rejected because it lacks the glibc-based system
    libraries that the project's native-code dependencies need.
    """
    dockerfile = filesystem.get("Dockerfile", "")

    if strict_tag:
        if "python:3.11-slim" in dockerfile:
            ok = ["Step 1/5 : FROM python:3.11-slim", "Successfully built docker image", ""]
            return {"exit_code": 0, "logs": "\n".join(ok)}
        failure = [
            "Step 1/5 : FROM ...",
            "ERROR: Docker build failed.",
            " Base image rejected: the alpine variant lacks the system",
            " libraries (glibc, build tools) required by the project's",
            " native dependencies.",
            " Expected base image: python:3.11-slim",
            " Inspect your Dockerfile.",
            "",
        ]
        return {"exit_code": 1, "logs": "\n".join(failure)}

    if f"python:{CORRECT_PYTHON_VERSION}" in dockerfile:
        ok = [
            f"Step 1/5 : FROM python:{CORRECT_PYTHON_VERSION}-slim",
            "Successfully built docker image",
            "",
        ]
        return {"exit_code": 0, "logs": "\n".join(ok)}
    failure = [
        "Step 1/5 : FROM ...",
        "ERROR: Docker build failed.",
        f" Expected base image: python:{CORRECT_PYTHON_VERSION}-slim",
        " Inspect your Dockerfile.",
        "",
    ]
    return {"exit_code": 1, "logs": "\n".join(failure)}
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def run_config_validate_stage(filesystem: dict) -> dict:
    """
    Verify that deploy_config.yml enables deployment.

    Passes only when the literal setting `deploy_enabled: true` is present;
    the error names the expected field and value so the agent can compare
    them against the file.
    """
    config = filesystem.get("deploy_config.yml", "")
    if "deploy_enabled: true" not in config:
        failure = "ERROR: Deployment configuration is invalid.\n"
        failure += " deploy_enabled must be 'true' in deploy_config.yml.\n"
        return {"exit_code": 1, "logs": failure}

    success = "Deployment configuration validated.\n"
    success += "All required settings are correct.\n"
    return {"exit_code": 0, "logs": success}
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def run_smoke_test_stage(filesystem: dict) -> dict:
    """Terminal stage that always succeeds — reached only after every prior stage passes."""
    logs = "Smoke test passed. Application started successfully.\n"
    return {"exit_code": 0, "logs": logs}
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def run_test_stage(filesystem: dict) -> dict:
    """
    Verify the Makefile's test target.

    The target must invoke `python -m pytest tests/` and must not use
    --collect-only, --dry-run, or unittest. The error names the expected
    command so the agent can compare it against the file.
    """
    makefile = filesystem.get("Makefile", "")
    forbidden = ("--collect-only", "--dry-run", "unittest")
    uses_pytest = "python -m pytest tests/" in makefile

    if uses_pytest and not any(marker in makefile for marker in forbidden):
        return {"exit_code": 0, "logs": "Running test suite...\nAll tests passed.\n"}

    failure = "ERROR: Test runner failed.\n"
    failure += " Expected test command: python -m pytest tests/\n"
    failure += " Check the test command in your Makefile.\n"
    return {"exit_code": 1, "logs": failure}
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def run_port_check_stage(filesystem: dict) -> dict:
    """
    Verify that service.yaml binds port 8080.

    The error names the expected port so the agent can compare it against
    the file.
    """
    service = filesystem.get("service.yaml", "")
    if "port: 8080" not in service:
        failure = "ERROR: Service configuration validation failed.\n"
        failure += " Expected port: 8080. Inspect service.yaml.\n"
        return {"exit_code": 1, "logs": failure}

    success = "Service configuration validated.\nPort binding is correct.\n"
    return {"exit_code": 0, "logs": success}
|
| 177 |
+
|
| 178 |
+
def run_ci_validate_stage(filesystem: dict) -> dict:
    """
    Validate stage ordering in ci.yml via validate_ci_stages().

    Expected format is a single comma-separated line such as
    `stages: install, build, test`; install (and build, when present) must
    appear before test. A ValueError from the validator becomes a failing
    stage result.
    """
    ci_yml = filesystem.get("ci.yml", "")
    try:
        validate_ci_stages(ci_yml)
    except ValueError as err:
        return {"exit_code": 1, "logs": f"ERROR: {err}\n"}
    return {"exit_code": 0, "logs": "ci.yml validation passed. Stage order is correct.\n"}
|
| 193 |
+
|
| 194 |
+
def _run_medium_type_a_pipeline(filesystem: dict) -> dict:
    """install (always passes) → env_check → docker_build"""
    # Stages are wrapped in lambdas so each one only runs if reached.
    plan = [
        ("install", lambda: run_install_stage(filesystem, task="medium")),
        ("env_check", lambda: run_env_check_stage(filesystem)),
        ("docker_build", lambda: run_docker_build_stage(filesystem)),
    ]
    transcript = ""
    for stage_name, runner in plan:
        outcome = runner()
        transcript += f"=== Stage: {stage_name} ===\n" + outcome["logs"]
        if outcome["exit_code"] != 0:
            return {"exit_code": 1, "logs": transcript}
    return {"exit_code": 0, "logs": transcript}
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def _run_medium_type_b_pipeline(filesystem: dict) -> dict:
    """install (missing pkg) → config_validate → smoke_test"""
    # Stages are wrapped in lambdas so each one only runs if reached.
    plan = [
        ("install", lambda: run_install_stage(filesystem, task="medium")),
        ("config_validate", lambda: run_config_validate_stage(filesystem)),
        ("smoke_test", lambda: run_smoke_test_stage(filesystem)),
    ]
    transcript = ""
    for stage_name, runner in plan:
        outcome = runner()
        transcript += f"=== Stage: {stage_name} ===\n" + outcome["logs"]
        if outcome["exit_code"] != 0:
            return {"exit_code": 1, "logs": transcript}
    return {"exit_code": 0, "logs": transcript}
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def _run_medium_type_c_pipeline(filesystem: dict) -> dict:
    """install (always passes) → env_check → test (Makefile)"""
    # Stages are wrapped in lambdas so each one only runs if reached.
    plan = [
        ("install", lambda: run_install_stage(filesystem, task="medium")),
        ("env_check", lambda: run_env_check_stage(filesystem)),
        ("test", lambda: run_test_stage(filesystem)),
    ]
    transcript = ""
    for stage_name, runner in plan:
        outcome = runner()
        transcript += f"=== Stage: {stage_name} ===\n" + outcome["logs"]
        if outcome["exit_code"] != 0:
            return {"exit_code": 1, "logs": transcript}
    return {"exit_code": 0, "logs": transcript}
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def _run_medium_type_d_pipeline(filesystem: dict) -> dict:
    """install (always passes) → port_check → docker_build"""
    # Stages are wrapped in lambdas so each one only runs if reached.
    plan = [
        ("install", lambda: run_install_stage(filesystem, task="medium")),
        ("port_check", lambda: run_port_check_stage(filesystem)),
        ("docker_build", lambda: run_docker_build_stage(filesystem)),
    ]
    transcript = ""
    for stage_name, runner in plan:
        outcome = runner()
        transcript += f"=== Stage: {stage_name} ===\n" + outcome["logs"]
        if outcome["exit_code"] != 0:
            return {"exit_code": 1, "logs": transcript}
    return {"exit_code": 0, "logs": transcript}
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
def _run_medium_pipeline(filesystem: dict) -> dict:
    """
    Detect the medium scenario type from the filesystem and dispatch.

    Each type has a unique distinguishing file (checked in this order):
        Type C → Makefile
        Type B → deploy_config.yml
        Type D → service.yaml
        Type A → fallback (Dockerfile + .env.ci)
    """
    if "Makefile" in filesystem:
        return _run_medium_type_c_pipeline(filesystem)
    if "deploy_config.yml" in filesystem:
        return _run_medium_type_b_pipeline(filesystem)
    if "service.yaml" in filesystem:
        return _run_medium_type_d_pipeline(filesystem)
    return _run_medium_type_a_pipeline(filesystem)
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
def _run_hard_type_a_pipeline(filesystem: dict) -> dict:
    """ci_validate → docker_build(strict) → install(hard)."""
    # Stages are wrapped in lambdas so each one only runs if reached.
    plan = [
        ("ci_validate", lambda: run_ci_validate_stage(filesystem)),
        ("docker_build", lambda: run_docker_build_stage(filesystem, strict_tag=True)),
        ("install", lambda: run_install_stage(filesystem, task="hard")),
    ]
    transcript = ""
    for stage_name, runner in plan:
        outcome = runner()
        transcript += f"=== Stage: {stage_name} ===\n" + outcome["logs"]
        if outcome["exit_code"] != 0:
            return {"exit_code": 1, "logs": transcript}
    return {"exit_code": 0, "logs": transcript}
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
def _run_hard_type_b_pipeline(filesystem: dict) -> dict:
    """ci_validate → env_check → test (Makefile)."""
    # Stages are wrapped in lambdas so each one only runs if reached.
    plan = [
        ("ci_validate", lambda: run_ci_validate_stage(filesystem)),
        ("env_check", lambda: run_env_check_stage(filesystem)),
        ("test", lambda: run_test_stage(filesystem)),
    ]
    transcript = ""
    for stage_name, runner in plan:
        outcome = runner()
        transcript += f"=== Stage: {stage_name} ===\n" + outcome["logs"]
        if outcome["exit_code"] != 0:
            return {"exit_code": 1, "logs": transcript}
    return {"exit_code": 0, "logs": transcript}
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def _run_hard_type_c_pipeline(filesystem: dict) -> dict:
    """docker_build(strict) → config_validate → port_check."""
    # Stages are wrapped in lambdas so each one only runs if reached.
    plan = [
        ("docker_build", lambda: run_docker_build_stage(filesystem, strict_tag=True)),
        ("config_validate", lambda: run_config_validate_stage(filesystem)),
        ("port_check", lambda: run_port_check_stage(filesystem)),
    ]
    transcript = ""
    for stage_name, runner in plan:
        outcome = runner()
        transcript += f"=== Stage: {stage_name} ===\n" + outcome["logs"]
        if outcome["exit_code"] != 0:
            return {"exit_code": 1, "logs": transcript}
    return {"exit_code": 0, "logs": transcript}
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
def _run_hard_type_d_pipeline(filesystem: dict) -> dict:
    """install(hard, missing pkg) → env_check → docker_build(strict)."""
    # Stages are wrapped in lambdas so each one only runs if reached.
    plan = [
        ("install", lambda: run_install_stage(filesystem, task="hard")),
        ("env_check", lambda: run_env_check_stage(filesystem)),
        ("docker_build", lambda: run_docker_build_stage(filesystem, strict_tag=True)),
    ]
    transcript = ""
    for stage_name, runner in plan:
        outcome = runner()
        transcript += f"=== Stage: {stage_name} ===\n" + outcome["logs"]
        if outcome["exit_code"] != 0:
            return {"exit_code": 1, "logs": transcript}
    return {"exit_code": 0, "logs": transcript}
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
def _run_hard_pipeline(filesystem: dict) -> dict:
    """
    Detect the hard scenario variant from the filesystem and dispatch.

    Cascading failures: each stage is only reached after the previous one
    passes, so bugs surface one at a time as the agent fixes them.

    Distinguishing markers (checked in this order):
        Type B → ci.yml + Makefile
        Type A → ci.yml alone
        Type C → service.yaml (with deploy_config.yml)
        Type D → fallback (Dockerfile + .env.ci, no ci.yml)
    """
    has_ci = "ci.yml" in filesystem
    if has_ci and "Makefile" in filesystem:
        return _run_hard_type_b_pipeline(filesystem)
    if has_ci:
        return _run_hard_type_a_pipeline(filesystem)
    if "service.yaml" in filesystem:
        return _run_hard_type_c_pipeline(filesystem)
    return _run_hard_type_d_pipeline(filesystem)
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
def run_pipeline(filesystem: dict, task: str = "easy") -> dict:
    """
    Top-level dispatcher; call this from environment.py.

    Returns {"exit_code": int, "logs": str}. "medium" and "hard" run their
    multi-stage pipelines; any other task runs only the install stage.
    """
    runners = {
        "medium": _run_medium_pipeline,
        "hard": _run_hard_pipeline,
    }
    runner = runners.get(task)
    if runner is not None:
        return runner(filesystem)
    return run_install_stage(filesystem, task=task)
|
core/scenarios/generator.py
ADDED
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Procedural scenario generator for the CI/CD Doctor environment.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import random
|
| 6 |
+
|
| 7 |
+
from core.utils.packages import get_packages
|
| 8 |
+
|
| 9 |
+
# The correct runtime is Python 3.11, so every entry here is "the bug".
PYTHON_VERSIONS = ["3.9", "3.10"]

# Env vars the env_check stage requires in .env.ci; scenarios drop one.
REQUIRED_ENV_VARS = ["DATABASE_URL", "API_KEY", "SECRET_KEY"]

# The expected service port is 8080; these are the decoy values.
WRONG_PORTS = [3000, 5000, 9000]

# Test commands that look plausible but never actually run the suite.
WRONG_TEST_COMMANDS = [
    "python -m pytest tests/ --collect-only",  # collects but never runs
    "python -m unittest discover tests/",  # wrong runner
    "python -m pytest tests/ --dry-run",  # dry-run, no output
]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def generate_easy_scenario(seed: int) -> dict:
    """
    Build the easy scenario: a filesystem dict plus an answer_key.

    The generated requirements.txt omits exactly one required package;
    the answer_key records which package must be added back.
    Deterministic: the same seed always produces the same scenario.
    """
    rng = random.Random(seed)
    packages = get_packages("easy")
    dropped = rng.choice(packages)
    kept = [pkg for pkg in packages if pkg != dropped]

    filesystem = {
        "requirements.txt": "\n".join(kept) + "\n",
        "pipeline.yaml": "stages:\n - install\n",
        "logs/install.log": "",
        "app.py": "import flask\nimport numpy\n# app code here\n",
    }
    return {
        "filesystem": filesystem,
        "answer_key": {"fixes": {"requirements.txt": dropped}},
    }
|
| 42 |
+
|
| 43 |
+
def _medium_type_a(rng: random.Random, all_packages: list) -> dict:
    """
    Type A: wrong Python version (Dockerfile) + missing env var (.env.ci).
    Pipeline: install → env_check → docker_build
    Both files must be fixed. install always passes.
    """
    bad_version = rng.choice(PYTHON_VERSIONS)
    dropped_var = rng.choice(REQUIRED_ENV_VARS)
    kept_vars = sorted(v for v in REQUIRED_ENV_VARS if v != dropped_var)
    env_ci = "".join(f"{name}=placeholder\n" for name in kept_vars)

    dockerfile = (
        f"FROM python:{bad_version}-slim\n"
        "WORKDIR /app\n"
        "COPY requirements.txt .\n"
        "RUN pip install -r requirements.txt\n"
        "COPY . .\n"
        'CMD ["python", "app.py"]\n'
    )
    return {
        "filesystem": {
            "requirements.txt": "\n".join(all_packages) + "\n",
            "Dockerfile": dockerfile,
            ".env.ci": env_ci,
            "pipeline.yaml": "stages:\n - install\n - env_check\n - docker_build\n",
            "app.py": "import flask\n# app code here\n",
            "logs/install.log": "",
        },
        "answer_key": {
            "fixes": {
                "Dockerfile": "python:3.11",
                ".env.ci": dropped_var,
            },
        },
    }
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def _medium_type_b(rng: random.Random, all_packages: list) -> dict:
|
| 80 |
+
"""
|
| 81 |
+
Type B: missing package (requirements.txt) + deployment flag off (deploy_config.yml).
|
| 82 |
+
Pipeline: install → config_validate → smoke_test
|
| 83 |
+
install fails first; after fixing, config_validate fails.
|
| 84 |
+
"""
|
| 85 |
+
missing_pkg = rng.choice(all_packages)
|
| 86 |
+
present_pkgs = [p for p in all_packages if p != missing_pkg]
|
| 87 |
+
|
| 88 |
+
return {
|
| 89 |
+
"filesystem": {
|
| 90 |
+
"requirements.txt": "\n".join(present_pkgs) + "\n",
|
| 91 |
+
"deploy_config.yml": (
|
| 92 |
+
"target_env: production\n"
|
| 93 |
+
"deploy_enabled: false\n" # BUG: must be true
|
| 94 |
+
"replicas: 2\n"
|
| 95 |
+
"health_check_path: /health\n"
|
| 96 |
+
"timeout: 30\n"
|
| 97 |
+
),
|
| 98 |
+
"pipeline.yaml": "stages:\n - install\n - config_validate\n - smoke_test\n",
|
| 99 |
+
"app.py": "import flask\n# app code here\n",
|
| 100 |
+
"logs/install.log": "",
|
| 101 |
+
},
|
| 102 |
+
"answer_key": {
|
| 103 |
+
"fixes": {
|
| 104 |
+
"requirements.txt": missing_pkg,
|
| 105 |
+
"deploy_config.yml": "deploy_enabled: true",
|
| 106 |
+
},
|
| 107 |
+
},
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def _medium_type_c(rng: random.Random, all_packages: list) -> dict:
    """
    Type C: wrong test command (Makefile) + missing env var (.env.ci).
    Pipeline: install → env_check → test
    env_check fails first; after fixing it, test fails due to the bad Makefile.
    """
    bad_cmd = rng.choice(WRONG_TEST_COMMANDS)
    dropped_var = rng.choice(REQUIRED_ENV_VARS)
    kept_vars = sorted(v for v in REQUIRED_ENV_VARS if v != dropped_var)
    env_ci = "".join(f"{name}=placeholder\n" for name in kept_vars)

    makefile = ".PHONY: test\ntest:\n\t" + bad_cmd + "\n"
    return {
        "filesystem": {
            "requirements.txt": "\n".join(all_packages) + "\n",
            ".env.ci": env_ci,
            "Makefile": makefile,
            "pipeline.yaml": "stages:\n - install\n - env_check\n - test\n",
            "app.py": "import flask\n# app code here\n",
            "logs/install.log": "",
        },
        "answer_key": {
            "fixes": {
                ".env.ci": dropped_var,
                "Makefile": "python -m pytest tests/",
            },
        },
    }
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def _medium_type_d(rng: random.Random, all_packages: list) -> dict:
    """
    Type D: wrong port (service.yaml) + wrong Python version (Dockerfile).
    Pipeline: install → port_check → docker_build
    port_check fails first; after fixing it, docker_build fails.
    install always passes.
    """
    bad_version = rng.choice(PYTHON_VERSIONS)
    bad_port = rng.choice(WRONG_PORTS)

    dockerfile = (
        f"FROM python:{bad_version}-slim\n"
        "WORKDIR /app\n"
        "COPY requirements.txt .\n"
        "RUN pip install -r requirements.txt\n"
        "COPY . .\n"
        'CMD ["python", "app.py"]\n'
    )
    service_yaml = (
        "apiVersion: v1\n"
        "kind: Service\n"
        "metadata:\n"
        " name: app\n"
        "spec:\n"
        f" port: {bad_port}\n"
    )
    return {
        "filesystem": {
            "requirements.txt": "\n".join(all_packages) + "\n",
            "Dockerfile": dockerfile,
            "service.yaml": service_yaml,
            "pipeline.yaml": "stages:\n - install\n - port_check\n - docker_build\n",
            "app.py": "import flask\n# app code here\n",
            "logs/install.log": "",
        },
        "answer_key": {
            "fixes": {
                "service.yaml": "port: 8080",
                "Dockerfile": "python:3.11",
            },
        },
    }
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def generate_medium_scenario(seed: int) -> dict:
    """
    Randomly selects one of four structurally distinct medium scenario types,
    then generates the specifics (which var, which version, etc.) from the
    same seed. Same seed → same scenario every time.
    """
    rng = random.Random(seed)
    packages = get_packages("medium")
    builders = {
        "A": _medium_type_a,
        "B": _medium_type_b,
        "C": _medium_type_c,
        "D": _medium_type_d,
    }
    # The type draw comes first so the rng stream matches across runs.
    build = builders[rng.choice(["A", "B", "C", "D"])]
    return build(rng, packages)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def _hard_type_a(rng: random.Random, all_packages: list) -> dict:
|
| 207 |
+
"""
|
| 208 |
+
Type A (Interdependent):
|
| 209 |
+
ci.yml ordering → Dockerfile alpine → numpy version pin.
|
| 210 |
+
|
| 211 |
+
Behavior:
|
| 212 |
+
- docker_build fails first due to alpine
|
| 213 |
+
- AFTER fixing Docker base, install fails due to numpy incompatibility
|
| 214 |
+
- Demonstrates interdependent failures (fixing one reveals another)
|
| 215 |
+
|
| 216 |
+
Pipeline: ci_validate → docker_build(strict) → install(hard).
|
| 217 |
+
"""
|
| 218 |
+
_ = rng
|
| 219 |
+
|
| 220 |
+
# numpy version only breaks AFTER Docker base is fixed (alpine -> slim)
|
| 221 |
+
requirements_lines = [
|
| 222 |
+
"numpy==1.21" if pkg == "numpy" else pkg
|
| 223 |
+
for pkg in all_packages
|
| 224 |
+
]
|
| 225 |
+
|
| 226 |
+
# Randomize ci.yml format (inline vs YAML list)
|
| 227 |
+
if rng.random() < 0.5:
|
| 228 |
+
ci_content = "stages: test, build, install\n"
|
| 229 |
+
ci_fix = "stages: install, build, test\n"
|
| 230 |
+
else:
|
| 231 |
+
ci_content = (
|
| 232 |
+
"stages:\n"
|
| 233 |
+
" - test\n"
|
| 234 |
+
" - build\n"
|
| 235 |
+
" - install\n"
|
| 236 |
+
)
|
| 237 |
+
ci_fix = (
|
| 238 |
+
"stages:\n"
|
| 239 |
+
" - install\n"
|
| 240 |
+
" - build\n"
|
| 241 |
+
" - test\n"
|
| 242 |
+
)
|
| 243 |
+
|
| 244 |
+
return {
|
| 245 |
+
"filesystem": {
|
| 246 |
+
"requirements.txt": "\n".join(requirements_lines) + "\n",
|
| 247 |
+
"Dockerfile": (
|
| 248 |
+
"FROM python:3.11-alpine\n" # BUG 1: causes build failure first
|
| 249 |
+
"WORKDIR /app\n"
|
| 250 |
+
"COPY requirements.txt .\n"
|
| 251 |
+
"RUN pip install -r requirements.txt\n" # BUG 2 surfaces only after base fix
|
| 252 |
+
"COPY . .\n"
|
| 253 |
+
'CMD ["python", "app.py"]\n'
|
| 254 |
+
),
|
| 255 |
+
"ci.yml": ci_content,
|
| 256 |
+
"pipeline.yaml": "stages:\n - ci_validate\n - docker_build\n - install\n",
|
| 257 |
+
"app.py": "import flask\nimport numpy\n# app code here\n",
|
| 258 |
+
"logs/install.log": "",
|
| 259 |
+
},
|
| 260 |
+
"answer_key": {
|
| 261 |
+
"fixes": {
|
| 262 |
+
"ci.yml": ci_fix,
|
| 263 |
+
"Dockerfile": "python:3.11-slim",
|
| 264 |
+
"requirements.txt": "numpy==1.26",
|
| 265 |
+
},
|
| 266 |
+
},
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
def _hard_type_b(rng: random.Random, all_packages: list) -> dict:
    """
    Type B (Interdependent):
    ci.yml ordering → missing env var → wrong test command.

    Behavior:
    - env_check fails first
    - AFTER fixing env, the test stage fails due to the bad command

    Demonstrates dependency between runtime config and execution.

    Pipeline: ci_validate → env_check → test.
    """
    bad_cmd = rng.choice(WRONG_TEST_COMMANDS)
    dropped_var = rng.choice(REQUIRED_ENV_VARS)
    kept_vars = sorted(v for v in REQUIRED_ENV_VARS if v != dropped_var)
    env_ci = "".join(f"{name}=placeholder\n" for name in kept_vars)

    # ci.yml comes in two equivalent formats; both carry the ordering bug.
    if rng.random() < 0.5:
        ci_content = "stages: test, build, install\n"
        ci_fix = "stages: install, build, test\n"
    else:
        ci_content = (
            "stages:\n"
            " - test\n"
            " - build\n"
            " - install\n"
        )
        ci_fix = (
            "stages:\n"
            " - install\n"
            " - build\n"
            " - test\n"
        )

    # The Makefile bug surfaces only after the env var is fixed.
    makefile = ".PHONY: test\ntest:\n\t" + bad_cmd + "\n"
    return {
        "filesystem": {
            "requirements.txt": "\n".join(all_packages) + "\n",
            "ci.yml": ci_content,
            ".env.ci": env_ci,
            "Makefile": makefile,
            "pipeline.yaml": "stages:\n - ci_validate\n - env_check\n - test\n",
            "app.py": "import flask\n# app code here\n",
            "logs/install.log": "",
        },
        "answer_key": {
            "fixes": {
                "ci.yml": ci_fix,
                ".env.ci": dropped_var,
                "Makefile": "python -m pytest tests/",
            },
        },
    }
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def _hard_type_c(rng: random.Random, all_packages: list) -> dict:
    """
    Type C: Dockerfile alpine → deploy disabled → wrong service port.
    Pipeline: docker_build(strict) → config_validate → port_check.
    """
    # The previous "_ = rng  # reserved for future per-seed variation" line
    # was stale and misleading: rng is in fact consumed right here.
    wrong_port = rng.choice(WRONG_PORTS)

    return {
        "filesystem": {
            "requirements.txt": "\n".join(all_packages) + "\n",
            "Dockerfile": (
                "FROM python:3.11-alpine\n"
                "WORKDIR /app\n"
                "COPY requirements.txt .\n"
                "RUN pip install -r requirements.txt\n"
                "COPY . .\n"
                'CMD ["python", "app.py"]\n'
            ),
            "deploy_config.yml": (
                "target_env: production\n"
                "deploy_enabled: false\n"
                "replicas: 2\n"
                "health_check_path: /health\n"
                "timeout: 30\n"
            ),
            "service.yaml": (
                "apiVersion: v1\n"
                "kind: Service\n"
                "metadata:\n"
                " name: app\n"
                "spec:\n"
                f" port: {wrong_port}\n"
            ),
            "pipeline.yaml": "stages:\n - docker_build\n - config_validate\n - port_check\n",
            "app.py": "import flask\n# app code here\n",
            "logs/install.log": "",
        },
        "answer_key": {
            "fixes": {
                "Dockerfile": "python:3.11-slim",
                "deploy_config.yml": "deploy_enabled: true",
                "service.yaml": "port: 8080",
            },
        },
    }
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
def _hard_type_d(rng: random.Random, all_packages: list) -> dict:
    """
    Type D: missing package → missing env var → Dockerfile alpine.
    Pipeline: install(hard) → env_check → docker_build(strict).
    """
    dropped_pkg = rng.choice(all_packages)
    kept_pkgs = [pkg for pkg in all_packages if pkg != dropped_pkg]
    dropped_var = rng.choice(REQUIRED_ENV_VARS)
    kept_vars = sorted(v for v in REQUIRED_ENV_VARS if v != dropped_var)
    env_ci = "".join(f"{name}=placeholder\n" for name in kept_vars)

    return {
        "filesystem": {
            "requirements.txt": "\n".join(kept_pkgs) + "\n",
            ".env.ci": env_ci,
            "Dockerfile": (
                "FROM python:3.11-alpine\n"
                "WORKDIR /app\n"
                "COPY requirements.txt .\n"
                "RUN pip install -r requirements.txt\n"
                "COPY . .\n"
                'CMD ["python", "app.py"]\n'
            ),
            "pipeline.yaml": "stages:\n - install\n - env_check\n - docker_build\n",
            "app.py": "import flask\n# app code here\n",
            "logs/install.log": "",
        },
        "answer_key": {
            "fixes": {
                "requirements.txt": dropped_pkg,
                ".env.ci": dropped_var,
                "Dockerfile": "python:3.11-slim",
            },
        },
    }
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
def generate_hard_scenario(seed: int) -> dict:
    """
    Randomly selects one of four structurally distinct hard scenario types,
    then generates the specifics from the same seed. Each variant is a
    three-fix cascading failure — each pipeline run stops at the first
    failing stage, so bugs surface one at a time as the agent fixes them.
    Same seed → same scenario every time.
    """
    rng = random.Random(seed)
    packages = get_packages("hard")
    builders = {
        "A": _hard_type_a,
        "B": _hard_type_b,
        "C": _hard_type_c,
        "D": _hard_type_d,
    }
    # The type draw comes first so the rng stream matches across runs.
    return builders[rng.choice(["A", "B", "C", "D"])](rng, packages)
|
core/utils/packages.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Single source of truth for required packages across all task difficulties.
|
| 3 |
+
|
| 4 |
+
Add new entries here when implementing medium and hard scenarios.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
# Single source of truth: one required-package list per task difficulty.
PACKAGES: dict[str, list[str]] = {
    "easy": ["flask", "numpy", "pandas", "requests", "pydantic"],
    # medium: install always passes; failures live in Dockerfile + .env.ci
    "medium": ["flask", "numpy", "pandas", "requests", "pydantic"],
    # hard: the install failure is a numpy version conflict, not a missing package
    "hard": ["flask", "numpy", "pandas", "requests", "pydantic"],
}


def get_packages(task: str) -> list[str]:
    """Return the required package list for a given task difficulty."""
    packages = PACKAGES.get(task)
    if packages is None:
        raise ValueError(f"Unknown task: {task!r}. Valid tasks: {list(PACKAGES)}")
    return packages


def get_packages_set(task: str) -> set[str]:
    """Return required packages as a set (used for O(1) lookup in stage_runner)."""
    return set(get_packages(task))
|
core/validation/parser.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Free-form command parser for the CI/CD Doctor environment.
|
| 3 |
+
Converts raw command strings into structured ParsedCommand objects.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class ParsedCommand:
    """Structured form of one agent command."""

    # One of: "cat" | "echo_append" | "sed" | "pipeline_run"
    # | "pipeline_logs" | "pipeline_status" | "unknown"
    type: str
    filename: Optional[str] = None
    content: Optional[str] = None  # for echo >>
    pattern: Optional[str] = None  # for sed: old value
    replacement: Optional[str] = None  # for sed: new value
    stage: Optional[str] = None


def _strip_chained(command: str) -> str:
    """Drop everything after the first top-level '&&' or ';'.

    Separators inside single or double quotes are part of the command's
    payload (e.g. echo 'a; b' >> f) and must not truncate it — the old
    naive str.split() broke such commands.
    """
    quote = None
    i = 0
    while i < len(command):
        ch = command[i]
        if quote:
            if ch == quote:
                quote = None
        elif ch in "'\"":
            quote = ch
        elif ch == ";":
            return command[:i].strip()
        elif ch == "&" and command.startswith("&&", i):
            return command[:i].strip()
        i += 1
    return command


def parse_command(command: str) -> ParsedCommand:
    """Parse a raw command string into a ParsedCommand.

    Only the first command of a chained line is considered; anything
    unrecognized yields ParsedCommand(type="unknown").
    """
    command = _strip_chained(command.strip())

    m = re.match(r"cat\s+(.+)", command)
    if m:
        return ParsedCommand(type="cat", filename=m.group(1).strip())

    m = re.match(r'echo\s+([\'"])(.*?)\1\s*>>\s*(\S+)', command, re.DOTALL)
    if m:
        return ParsedCommand(
            type="echo_append",
            content=m.group(2),
            filename=m.group(3),
        )

    # Example supported: sed -i 's/old/new/g' file OR sed -i "s|old|new|" file
    m = re.match(
        r"sed\s+-i\s+([\'\"]?)s(.)(.+?)\2(.*?)\2([g]*)\1\s+(\S+)",
        command,
    )
    if m:
        return ParsedCommand(
            type="sed",
            pattern=m.group(3),
            replacement=m.group(4),
            filename=m.group(6),
        )

    if re.fullmatch(r"pipeline\s+run", command):
        return ParsedCommand(type="pipeline_run")

    m = re.match(r"pipeline\s+logs(?:\s+(\S+))?\s*$", command)
    if m:
        return ParsedCommand(type="pipeline_logs", stage=m.group(1))

    if re.fullmatch(r"pipeline\s+status", command):
        return ParsedCommand(type="pipeline_status")

    return ParsedCommand(type="unknown")
|
core/validation/validator.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
def parse_stages(content: str) -> list:
|
| 4 |
+
# Try inline format: stages: a, b, c
|
| 5 |
+
match = re.search(r"^stages:\s*(.+)$", content, re.MULTILINE)
|
| 6 |
+
if match and "," in match.group(1) and "\n" not in match.group(1):
|
| 7 |
+
return [s.strip() for s in match.group(1).split(",")]
|
| 8 |
+
|
| 9 |
+
lines = content.splitlines()
|
| 10 |
+
stages = []
|
| 11 |
+
in_stages_block = False
|
| 12 |
+
|
| 13 |
+
for line in lines:
|
| 14 |
+
if line.strip().startswith("stages:"):
|
| 15 |
+
in_stages_block = True
|
| 16 |
+
continue
|
| 17 |
+
|
| 18 |
+
if in_stages_block:
|
| 19 |
+
if line.strip().startswith("-"):
|
| 20 |
+
stages.append(line.strip().lstrip("- ").strip())
|
| 21 |
+
elif line.strip() == "":
|
| 22 |
+
continue
|
| 23 |
+
else:
|
| 24 |
+
break
|
| 25 |
+
|
| 26 |
+
return stages
|
| 27 |
+
|
| 28 |
+
def validate_ci_stages(ci_content: str):
    """Raise ValueError unless ci.yml declares a valid stage ordering.

    Requires both 'install' and 'test' stages; 'test' must come after
    'install' and — when a 'build' stage exists — after 'build' as well.
    NOTE(review): the relative order of 'install' vs 'build' is not
    checked; presumably intentional, but worth confirming.
    """
    stages = parse_stages(ci_content)

    if not stages:
        raise ValueError("ci.yml has no valid stages defined.")

    if "install" not in stages or "test" not in stages:
        raise ValueError("ci.yml must define both 'install' and 'test' stages.")

    test_idx = stages.index("test")
    must_precede = ["install", "build"] if "build" in stages else ["install"]
    if any(test_idx < stages.index(name) for name in must_precede):
        raise ValueError("ci.yml stage ordering is invalid.")
|
models.py
CHANGED
|
@@ -3,7 +3,7 @@ Data models for the CI/CD Doctor RL environment.
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
from pydantic import BaseModel, Field
|
| 6 |
-
from typing import
|
| 7 |
|
| 8 |
|
| 9 |
class PipelineAction(BaseModel):
|
|
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
from pydantic import BaseModel, Field
|
| 6 |
+
from typing import Dict, Any, List
|
| 7 |
|
| 8 |
|
| 9 |
class PipelineAction(BaseModel):
|
openenv.yaml
CHANGED
|
@@ -12,7 +12,7 @@ tasks:
|
|
| 12 |
description: "Single-file failure: one required package is missing from requirements.txt. Inspect the install logs, identify the missing package, and add it."
|
| 13 |
grader:
|
| 14 |
type: programmatic
|
| 15 |
-
module: core.grader:grade
|
| 16 |
success_threshold: 0.70
|
| 17 |
prompt_template: |
|
| 18 |
Score the agent's CI/CD debugging session from 0.01 to 0.99
|
|
@@ -27,7 +27,7 @@ tasks:
|
|
| 27 |
description: "Two-file cascading failure (one of four randomized variants). Each variant pairs two bugs across files such as Dockerfile (wrong Python version or port), .env.ci (missing required env var), requirements.txt (missing package), deploy_config.yml (deploy_enabled false), Makefile (wrong test command), or service.yaml (wrong port). Bugs surface one at a time across the pipeline stages — fix both to make it pass."
|
| 28 |
grader:
|
| 29 |
type: programmatic
|
| 30 |
-
module: core.grader:grade
|
| 31 |
success_threshold: 0.60
|
| 32 |
prompt_template: |
|
| 33 |
Score the agent's CI/CD debugging session from 0.01 to 0.99
|
|
@@ -45,7 +45,7 @@ tasks:
|
|
| 45 |
description: "Three-file cascading failure (one of four randomized variants). Each variant chains three bugs across files such as ci.yml (wrong stage order), Dockerfile (alpine base incompatible with binary wheels), requirements.txt (missing package or wrong version pin like numpy==1.21), .env.ci (missing env var), Makefile (wrong test command), deploy_config.yml (deploy_enabled false), or service.yaml (wrong port). Each pipeline run stops at the first failing stage, so bugs reveal themselves one at a time — fix all three in sequence."
|
| 46 |
grader:
|
| 47 |
type: programmatic
|
| 48 |
-
module: core.grader:grade
|
| 49 |
success_threshold: 0.45
|
| 50 |
prompt_template: |
|
| 51 |
Score the agent's CI/CD debugging session from 0.01 to 0.99
|
|
|
|
| 12 |
description: "Single-file failure: one required package is missing from requirements.txt. Inspect the install logs, identify the missing package, and add it."
|
| 13 |
grader:
|
| 14 |
type: programmatic
|
| 15 |
+
module: core.grading.grader:grade
|
| 16 |
success_threshold: 0.70
|
| 17 |
prompt_template: |
|
| 18 |
Score the agent's CI/CD debugging session from 0.01 to 0.99
|
|
|
|
| 27 |
description: "Two-file cascading failure (one of four randomized variants). Each variant pairs two bugs across files such as Dockerfile (wrong Python version or port), .env.ci (missing required env var), requirements.txt (missing package), deploy_config.yml (deploy_enabled false), Makefile (wrong test command), or service.yaml (wrong port). Bugs surface one at a time across the pipeline stages — fix both to make it pass."
|
| 28 |
grader:
|
| 29 |
type: programmatic
|
| 30 |
+
module: core.grading.grader:grade
|
| 31 |
success_threshold: 0.60
|
| 32 |
prompt_template: |
|
| 33 |
Score the agent's CI/CD debugging session from 0.01 to 0.99
|
|
|
|
| 45 |
description: "Three-file cascading failure (one of four randomized variants). Each variant chains three bugs across files such as ci.yml (wrong stage order), Dockerfile (alpine base incompatible with binary wheels), requirements.txt (missing package or wrong version pin like numpy==1.21), .env.ci (missing env var), Makefile (wrong test command), deploy_config.yml (deploy_enabled false), or service.yaml (wrong port). Each pipeline run stops at the first failing stage, so bugs reveal themselves one at a time — fix all three in sequence."
|
| 46 |
grader:
|
| 47 |
type: programmatic
|
| 48 |
+
module: core.grading.grader:grade
|
| 49 |
success_threshold: 0.45
|
| 50 |
prompt_template: |
|
| 51 |
Score the agent's CI/CD debugging session from 0.01 to 0.99
|
openenv_CI_CD_Doctor.egg-info/SOURCES.txt
CHANGED
|
@@ -1,16 +1,19 @@
|
|
| 1 |
LICENSE
|
| 2 |
README.md
|
|
|
|
|
|
|
| 3 |
pyproject.toml
|
| 4 |
./__init__.py
|
|
|
|
| 5 |
./inference.py
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
openenv_CI_CD_Doctor.egg-info/PKG-INFO
|
| 15 |
openenv_CI_CD_Doctor.egg-info/SOURCES.txt
|
| 16 |
openenv_CI_CD_Doctor.egg-info/dependency_links.txt
|
|
@@ -19,4 +22,5 @@ openenv_CI_CD_Doctor.egg-info/requires.txt
|
|
| 19 |
openenv_CI_CD_Doctor.egg-info/top_level.txt
|
| 20 |
server/__init__.py
|
| 21 |
server/app.py
|
|
|
|
| 22 |
server/environment.py
|
|
|
|
| 1 |
LICENSE
|
| 2 |
README.md
|
| 3 |
+
__init__.py
|
| 4 |
+
inference.py
|
| 5 |
pyproject.toml
|
| 6 |
./__init__.py
|
| 7 |
+
./client.py
|
| 8 |
./inference.py
|
| 9 |
+
./models.py
|
| 10 |
+
core/__init__.py
|
| 11 |
+
core/generator.py
|
| 12 |
+
core/grader.py
|
| 13 |
+
core/packages.py
|
| 14 |
+
core/parser.py
|
| 15 |
+
core/stage_runner.py
|
| 16 |
+
core/validator.py
|
| 17 |
openenv_CI_CD_Doctor.egg-info/PKG-INFO
|
| 18 |
openenv_CI_CD_Doctor.egg-info/SOURCES.txt
|
| 19 |
openenv_CI_CD_Doctor.egg-info/dependency_links.txt
|
|
|
|
| 22 |
openenv_CI_CD_Doctor.egg-info/top_level.txt
|
| 23 |
server/__init__.py
|
| 24 |
server/app.py
|
| 25 |
+
server/app_2.py
|
| 26 |
server/environment.py
|
server/environment.py
CHANGED
|
@@ -6,10 +6,10 @@ Ties together generator, stage_runner, and parser into a step/reset/state loop.
|
|
| 6 |
import uuid
|
| 7 |
|
| 8 |
from models import PipelineAction, PipelineObservation, PipelineState
|
| 9 |
-
from core.generator import generate_easy_scenario, generate_medium_scenario, generate_hard_scenario
|
| 10 |
-
from core.stage_runner import run_pipeline
|
| 11 |
-
from core.parser import parse_command
|
| 12 |
-
from core.grader import grade as grade_state, balance_score, StepContext
|
| 13 |
|
| 14 |
MAX_STEPS_BY_TASK = {"easy": 10, "medium": 15, "hard": 25}
|
| 15 |
IDEAL_STEPS_BY_TASK = {"easy": 3, "medium": 6, "hard": 10}
|
|
|
|
| 6 |
import uuid
|
| 7 |
|
| 8 |
from models import PipelineAction, PipelineObservation, PipelineState
|
| 9 |
+
from core.scenarios.generator import generate_easy_scenario, generate_medium_scenario, generate_hard_scenario
|
| 10 |
+
from core.pipeline.stage_runner import run_pipeline
|
| 11 |
+
from core.validation.parser import parse_command
|
| 12 |
+
from core.grading.grader import grade as grade_state, balance_score, StepContext
|
| 13 |
|
| 14 |
MAX_STEPS_BY_TASK = {"easy": 10, "medium": 15, "hard": 25}
|
| 15 |
IDEAL_STEPS_BY_TASK = {"easy": 3, "medium": 6, "hard": 10}
|