Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- Dockerfile +3 -3
- inference.py +37 -7
- pytest-cache-files-1f62ra1g/container_sim/server/Dockerfile +32 -0
- pytest-cache-files-1f62ra1g/container_sim/server/__init__.py +5 -0
- pytest-cache-files-1f62ra1g/container_sim/server/app.py +127 -0
- pytest-cache-files-1f62ra1g/container_sim/server/code_review_env_environment.py +9 -0
- pytest-cache-files-1f62ra1g/container_sim/server/code_review_environment.py +5 -0
- pytest-cache-files-1f62ra1g/container_sim/server/compat.py +89 -0
- pytest-cache-files-1f62ra1g/container_sim/server/env.py +1 -0
- pytest-cache-files-1f62ra1g/container_sim/server/env_safe.py +505 -0
- pytest-cache-files-1f62ra1g/container_sim/server/graders/__init__.py +17 -0
- pytest-cache-files-1f62ra1g/container_sim/server/graders/common.py +69 -0
- pytest-cache-files-1f62ra1g/container_sim/server/graders/optimization.py +135 -0
- pytest-cache-files-1f62ra1g/container_sim/server/graders/pytest_runner.py +121 -0
- pytest-cache-files-1f62ra1g/container_sim/server/graders/syntax.py +60 -0
- pytest-cache-files-1f62ra1g/container_sim/server/grading.py +147 -0
- pytest-cache-files-1f62ra1g/container_sim/server/models.py +149 -0
- pytest-cache-files-1f62ra1g/container_sim/server/python_env_environment.py +9 -0
- pytest-cache-files-1f62ra1g/container_sim/server/requirements.txt +6 -0
- pytest-cache-files-1f62ra1g/container_sim/server/static_review.py +273 -0
- pytest-cache-files-1f62ra1g/container_sim/server/task_bank.py +340 -0
- pytest-cache-files-1f62ra1g/container_sim/server/tasks/__init__.py +12 -0
- pytest-cache-files-1f62ra1g/container_sim/server/tasks/task_bank.py +213 -0
- server/app.py +21 -11
- server/compat.py +89 -0
- server/env_safe.py +26 -13
- server/graders/__init__.py +17 -0
- server/graders/common.py +69 -0
- server/graders/optimization.py +135 -0
- server/graders/pytest_runner.py +121 -0
- server/graders/syntax.py +60 -0
- server/models.py +149 -0
- server/tasks/__init__.py +12 -0
- server/tasks/task_bank.py +213 -0
Dockerfile
CHANGED
|
@@ -10,11 +10,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
| 10 |
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
# Install Python dependencies
|
| 13 |
-
COPY
|
| 14 |
RUN pip install --no-cache-dir -r /app/server/requirements.txt
|
| 15 |
|
| 16 |
-
# Copy
|
| 17 |
-
COPY . /app
|
| 18 |
|
| 19 |
# Set environment variables
|
| 20 |
ENV PYTHONUNBUFFERED=1
|
|
|
|
| 10 |
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
# Install Python dependencies
|
| 13 |
+
COPY requirements.txt /app/server/requirements.txt
|
| 14 |
RUN pip install --no-cache-dir -r /app/server/requirements.txt
|
| 15 |
|
| 16 |
+
# Copy the self-contained server package
|
| 17 |
+
COPY . /app/server
|
| 18 |
|
| 19 |
# Set environment variables
|
| 20 |
ENV PYTHONUNBUFFERED=1
|
inference.py
CHANGED
|
@@ -404,7 +404,7 @@ def run_env(client: Optional[Any], model: str) -> Dict[str, Any]:
|
|
| 404 |
|
| 405 |
|
| 406 |
def format_step_message(result: Dict[str, Any]) -> str:
|
| 407 |
-
"""Format the
|
| 408 |
try:
|
| 409 |
fallback = bool(result.get("fallback", False))
|
| 410 |
reason = safe_text(result.get("reason", "completed"), "completed").lower().replace(" ", "_")
|
|
@@ -429,21 +429,49 @@ def format_step_message(result: Dict[str, Any]) -> str:
|
|
| 429 |
return "error handled: formatting_failed"
|
| 430 |
|
| 431 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
def main() -> int:
|
| 433 |
"""Run the inference workflow and always terminate successfully."""
|
|
|
|
| 434 |
step_message = "error handled: initialization_failed"
|
|
|
|
|
|
|
| 435 |
try:
|
| 436 |
model_name = safe_env("MODEL_NAME", DEFAULT_MODEL_NAME) or DEFAULT_MODEL_NAME
|
| 437 |
client = create_client()
|
| 438 |
result = run_env(client, model_name)
|
| 439 |
step_message = format_step_message(result)
|
|
|
|
| 440 |
except BaseException as exc:
|
| 441 |
step_message = f"error handled: {safe_text(exc, 'unexpected_failure').lower().replace(' ', '_')[:64]}"
|
|
|
|
| 442 |
finally:
|
| 443 |
try:
|
| 444 |
-
|
| 445 |
-
print(f"STEP: {step_message}")
|
| 446 |
-
print("END")
|
| 447 |
except Exception:
|
| 448 |
pass
|
| 449 |
return 0
|
|
@@ -454,9 +482,11 @@ if __name__ == "__main__":
|
|
| 454 |
main()
|
| 455 |
except BaseException:
|
| 456 |
try:
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
|
|
|
|
|
|
| 460 |
except Exception:
|
| 461 |
pass
|
| 462 |
sys.exit(0)
|
|
|
|
| 404 |
|
| 405 |
|
| 406 |
def format_step_message(result: Dict[str, Any]) -> str:
|
| 407 |
+
"""Format the structured STEP payload for stdout."""
|
| 408 |
try:
|
| 409 |
fallback = bool(result.get("fallback", False))
|
| 410 |
reason = safe_text(result.get("reason", "completed"), "completed").lower().replace(" ", "_")
|
|
|
|
| 429 |
return "error handled: formatting_failed"
|
| 430 |
|
| 431 |
|
| 432 |
+
def format_start_message() -> str:
|
| 433 |
+
"""Format the START payload for stdout."""
|
| 434 |
+
return "task=python_code_review_env"
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
def format_end_message(result: Optional[Dict[str, Any]]) -> str:
|
| 438 |
+
"""Format the structured END payload for stdout."""
|
| 439 |
+
try:
|
| 440 |
+
payload = result or {}
|
| 441 |
+
status = safe_text(payload.get("status", "ok"), "ok").lower().replace(" ", "_")
|
| 442 |
+
score = safe_float(payload.get("score", 0.0), 0.0)
|
| 443 |
+
done = str(bool(payload.get("done", True))).lower()
|
| 444 |
+
fallback = str(bool(payload.get("fallback", True))).lower()
|
| 445 |
+
return f"task=python_code_review_env status={status} score={score:.4f} done={done} fallback={fallback}"
|
| 446 |
+
except Exception:
|
| 447 |
+
return "task=python_code_review_env status=ok score=0.0000 done=true fallback=true"
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
def emit_structured_output(start_message: str, step_message: str, end_message: str) -> None:
|
| 451 |
+
"""Emit evaluator-readable output blocks to stdout."""
|
| 452 |
+
print(f"[START] {start_message}", flush=True)
|
| 453 |
+
print(f"[STEP] {step_message}", flush=True)
|
| 454 |
+
print(f"[END] {end_message}", flush=True)
|
| 455 |
+
|
| 456 |
+
|
| 457 |
def main() -> int:
|
| 458 |
"""Run the inference workflow and always terminate successfully."""
|
| 459 |
+
start_message = format_start_message()
|
| 460 |
step_message = "error handled: initialization_failed"
|
| 461 |
+
end_message = "task=python_code_review_env status=ok score=0.0000 done=true fallback=true"
|
| 462 |
+
result: Optional[Dict[str, Any]] = None
|
| 463 |
try:
|
| 464 |
model_name = safe_env("MODEL_NAME", DEFAULT_MODEL_NAME) or DEFAULT_MODEL_NAME
|
| 465 |
client = create_client()
|
| 466 |
result = run_env(client, model_name)
|
| 467 |
step_message = format_step_message(result)
|
| 468 |
+
end_message = format_end_message(result)
|
| 469 |
except BaseException as exc:
|
| 470 |
step_message = f"error handled: {safe_text(exc, 'unexpected_failure').lower().replace(' ', '_')[:64]}"
|
| 471 |
+
end_message = format_end_message(result)
|
| 472 |
finally:
|
| 473 |
try:
|
| 474 |
+
emit_structured_output(start_message, step_message, end_message)
|
|
|
|
|
|
|
| 475 |
except Exception:
|
| 476 |
pass
|
| 477 |
return 0
|
|
|
|
| 482 |
main()
|
| 483 |
except BaseException:
|
| 484 |
try:
|
| 485 |
+
emit_structured_output(
|
| 486 |
+
format_start_message(),
|
| 487 |
+
"error handled: fatal_guard",
|
| 488 |
+
"task=python_code_review_env status=ok score=0.0000 done=true fallback=true",
|
| 489 |
+
)
|
| 490 |
except Exception:
|
| 491 |
pass
|
| 492 |
sys.exit(0)
|
pytest-cache-files-1f62ra1g/container_sim/server/Dockerfile
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install system dependencies
|
| 6 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 7 |
+
gcc \
|
| 8 |
+
git \
|
| 9 |
+
curl \
|
| 10 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
+
|
| 12 |
+
# Install Python dependencies
|
| 13 |
+
COPY requirements.txt /app/server/requirements.txt
|
| 14 |
+
RUN pip install --no-cache-dir -r /app/server/requirements.txt
|
| 15 |
+
|
| 16 |
+
# Copy the self-contained server package
|
| 17 |
+
COPY . /app/server
|
| 18 |
+
|
| 19 |
+
# Set environment variables
|
| 20 |
+
ENV PYTHONUNBUFFERED=1
|
| 21 |
+
ENV HOST=0.0.0.0
|
| 22 |
+
ENV PORT=8000
|
| 23 |
+
ENV WORKERS=1
|
| 24 |
+
ENV MAX_CONCURRENT_ENVS=16
|
| 25 |
+
|
| 26 |
+
# Health check
|
| 27 |
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
|
| 28 |
+
CMD curl -f http://localhost:${PORT}/health || exit 1
|
| 29 |
+
|
| 30 |
+
# Run FastAPI app
|
| 31 |
+
EXPOSE ${PORT}
|
| 32 |
+
CMD ["python", "-m", "server.app"]
|
pytest-cache-files-1f62ra1g/container_sim/server/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Server exports for the Python code review environment."""
|
| 2 |
+
|
| 3 |
+
from .code_review_environment import CodeReviewEnvironment, PythonCodeReviewEnvironment, PythonEnvironment
|
| 4 |
+
|
| 5 |
+
__all__ = ["PythonEnvironment", "PythonCodeReviewEnvironment", "CodeReviewEnvironment"]
|
pytest-cache-files-1f62ra1g/container_sim/server/app.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application for the Python code review environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
from fastapi import APIRouter, HTTPException
|
| 8 |
+
from fastapi.responses import RedirectResponse
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from compat import create_app
|
| 12 |
+
from models import (
|
| 13 |
+
HealthResponse,
|
| 14 |
+
PythonCodeReviewAction,
|
| 15 |
+
PythonCodeReviewObservation,
|
| 16 |
+
PythonCodeReviewState,
|
| 17 |
+
TaskDescriptor,
|
| 18 |
+
TaskGrade,
|
| 19 |
+
)
|
| 20 |
+
except Exception:
|
| 21 |
+
from .compat import create_app
|
| 22 |
+
from .models import (
|
| 23 |
+
HealthResponse,
|
| 24 |
+
PythonCodeReviewAction,
|
| 25 |
+
PythonCodeReviewObservation,
|
| 26 |
+
PythonCodeReviewState,
|
| 27 |
+
TaskDescriptor,
|
| 28 |
+
TaskGrade,
|
| 29 |
+
)
|
| 30 |
+
from server.env import PythonCodeReviewEnvironment
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
try:
|
| 34 |
+
MAX_CONCURRENT_ENVS = max(int(os.getenv("MAX_CONCURRENT_ENVS", "16")), 1)
|
| 35 |
+
except Exception:
|
| 36 |
+
MAX_CONCURRENT_ENVS = 16
|
| 37 |
+
|
| 38 |
+
python_env = PythonCodeReviewEnvironment(verbose=False)
|
| 39 |
+
app = create_app(
|
| 40 |
+
PythonCodeReviewEnvironment,
|
| 41 |
+
PythonCodeReviewAction,
|
| 42 |
+
PythonCodeReviewObservation,
|
| 43 |
+
max_concurrent_envs=MAX_CONCURRENT_ENVS,
|
| 44 |
+
)
|
| 45 |
+
router = APIRouter(tags=["python-code-review"])
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@router.get("/", include_in_schema=False)
|
| 49 |
+
def root() -> RedirectResponse:
|
| 50 |
+
"""Redirect root to API documentation."""
|
| 51 |
+
return RedirectResponse(url="/docs")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
@router.get("/health", response_model=HealthResponse)
|
| 55 |
+
def health() -> HealthResponse:
|
| 56 |
+
"""Health check endpoint for deployment monitoring."""
|
| 57 |
+
return python_env.health()
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@router.get("/tasks", response_model=list)
|
| 61 |
+
def list_tasks() -> list:
|
| 62 |
+
"""List all available deterministic tasks."""
|
| 63 |
+
return python_env.list_task_summaries()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@router.get("/tasks/{task_id}", response_model=object)
|
| 67 |
+
def get_task(task_id: str) -> object:
|
| 68 |
+
"""Get a specific task by ID."""
|
| 69 |
+
try:
|
| 70 |
+
return python_env.get_task(task_id)
|
| 71 |
+
except ValueError as exc:
|
| 72 |
+
raise HTTPException(status_code=404, detail=str(exc)) from exc
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@router.post("/tasks/{task_id}/grade", response_model=TaskGrade)
|
| 76 |
+
def grade_task(task_id: str, payload: PythonCodeReviewAction) -> TaskGrade:
|
| 77 |
+
"""Grade code submission for a task without running an episode."""
|
| 78 |
+
if payload.action_type != "edit_code" or not payload.code:
|
| 79 |
+
raise HTTPException(
|
| 80 |
+
status_code=400,
|
| 81 |
+
detail="Requires action_type='edit_code' with code parameter."
|
| 82 |
+
)
|
| 83 |
+
try:
|
| 84 |
+
return python_env.grade_task_submission(task_id=task_id, code=payload.code)
|
| 85 |
+
except ValueError as exc:
|
| 86 |
+
raise HTTPException(status_code=404, detail=str(exc)) from exc
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
@router.post("/state", response_model=PythonCodeReviewState)
|
| 90 |
+
def get_state_post() -> RedirectResponse:
|
| 91 |
+
"""Redirect POST /state to GET for compatibility."""
|
| 92 |
+
return RedirectResponse(url="/state", status_code=303)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
app.include_router(router)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _prioritize_route(path: str, methods: set[str]) -> None:
|
| 99 |
+
"""Move a matching custom route ahead of default OpenEnv routes."""
|
| 100 |
+
try:
|
| 101 |
+
for index in range(len(app.router.routes) - 1, -1, -1):
|
| 102 |
+
route = app.router.routes[index]
|
| 103 |
+
route_path = getattr(route, "path", None)
|
| 104 |
+
route_methods = set(getattr(route, "methods", set()) or set())
|
| 105 |
+
if route_path == path and methods.issubset(route_methods):
|
| 106 |
+
app.router.routes.insert(0, app.router.routes.pop(index))
|
| 107 |
+
break
|
| 108 |
+
except Exception:
|
| 109 |
+
pass
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
_prioritize_route("/health", {"GET"})
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
|
| 116 |
+
"""Run the FastAPI application with uvicorn."""
|
| 117 |
+
import uvicorn
|
| 118 |
+
uvicorn.run(
|
| 119 |
+
app,
|
| 120 |
+
host=os.getenv("HOST", host),
|
| 121 |
+
port=int(os.getenv("PORT", str(port))),
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
if __name__ == "__main__":
|
| 126 |
+
main()
|
| 127 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/code_review_env_environment.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility shim for older imports."""
|
| 2 |
+
|
| 3 |
+
try:
|
| 4 |
+
from server.code_review_environment import CodeReviewEnvironment
|
| 5 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 6 |
+
from .code_review_environment import CodeReviewEnvironment
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
__all__ = ["CodeReviewEnvironment"]
|
pytest-cache-files-1f62ra1g/container_sim/server/code_review_environment.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility wrapper for older imports."""
|
| 2 |
+
|
| 3 |
+
from .env import CodeReviewEnvironment, PythonCodeReviewEnvironment, PythonEnvironment
|
| 4 |
+
|
| 5 |
+
__all__ = ["CodeReviewEnvironment", "PythonCodeReviewEnvironment", "PythonEnvironment"]
|
pytest-cache-files-1f62ra1g/container_sim/server/compat.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility helpers for OpenEnv and FastMCP runtime drift."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import sys
|
| 6 |
+
import types
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def install_openenv_fastmcp_compat() -> None:
|
| 11 |
+
"""Patch FastMCP API differences so older OpenEnv builds keep importing."""
|
| 12 |
+
try:
|
| 13 |
+
import fastmcp # type: ignore
|
| 14 |
+
except Exception:
|
| 15 |
+
return
|
| 16 |
+
|
| 17 |
+
try:
|
| 18 |
+
if not hasattr(fastmcp, "Client"):
|
| 19 |
+
class CompatClient:
|
| 20 |
+
"""Minimal async MCP client used for legacy OpenEnv imports."""
|
| 21 |
+
|
| 22 |
+
def __init__(self, *args: Any, **kwargs: Any) -> None:
|
| 23 |
+
self.args = args
|
| 24 |
+
self.kwargs = kwargs
|
| 25 |
+
|
| 26 |
+
async def __aenter__(self) -> "CompatClient":
|
| 27 |
+
return self
|
| 28 |
+
|
| 29 |
+
async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool:
|
| 30 |
+
return False
|
| 31 |
+
|
| 32 |
+
async def list_tools(self) -> list[Any]:
|
| 33 |
+
return []
|
| 34 |
+
|
| 35 |
+
async def call_tool(self, tool_name: str, arguments: dict[str, Any]) -> Any:
|
| 36 |
+
raise RuntimeError(
|
| 37 |
+
f"MCP client compatibility mode cannot call tool: {tool_name}"
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
fastmcp.Client = CompatClient # type: ignore[attr-defined]
|
| 41 |
+
except Exception:
|
| 42 |
+
pass
|
| 43 |
+
|
| 44 |
+
try:
|
| 45 |
+
client_pkg = sys.modules.get("fastmcp.client")
|
| 46 |
+
if client_pkg is None:
|
| 47 |
+
client_pkg = types.ModuleType("fastmcp.client")
|
| 48 |
+
sys.modules["fastmcp.client"] = client_pkg
|
| 49 |
+
|
| 50 |
+
client_mod = sys.modules.get("fastmcp.client.client")
|
| 51 |
+
if client_mod is None:
|
| 52 |
+
client_mod = types.ModuleType("fastmcp.client.client")
|
| 53 |
+
sys.modules["fastmcp.client.client"] = client_mod
|
| 54 |
+
|
| 55 |
+
if not hasattr(client_mod, "CallToolResult"):
|
| 56 |
+
class CallToolResult:
|
| 57 |
+
"""Compatibility container for legacy OpenEnv response handling."""
|
| 58 |
+
|
| 59 |
+
def __init__(
|
| 60 |
+
self,
|
| 61 |
+
content: Any = None,
|
| 62 |
+
structured_content: Any = None,
|
| 63 |
+
meta: Any = None,
|
| 64 |
+
data: Any = None,
|
| 65 |
+
is_error: bool = False,
|
| 66 |
+
) -> None:
|
| 67 |
+
self.content = content
|
| 68 |
+
self.structured_content = structured_content
|
| 69 |
+
self.meta = meta
|
| 70 |
+
self.data = data
|
| 71 |
+
self.is_error = is_error
|
| 72 |
+
|
| 73 |
+
client_mod.CallToolResult = CallToolResult
|
| 74 |
+
|
| 75 |
+
client_pkg.client = client_mod # type: ignore[attr-defined]
|
| 76 |
+
except Exception:
|
| 77 |
+
pass
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
install_openenv_fastmcp_compat()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
from openenv.core.env_server.http_server import create_app as openenv_create_app
|
| 84 |
+
from openenv.core.env_server.interfaces import Environment
|
| 85 |
+
from openenv.core.env_server.types import Action, Observation, State
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
create_app = openenv_create_app
|
| 89 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/env.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .env_safe import * # noqa: F401,F403
|
pytest-cache-files-1f62ra1g/container_sim/server/env_safe.py
ADDED
|
@@ -0,0 +1,505 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Safe OpenEnv environment for deterministic Python code repair tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Optional
|
| 6 |
+
from uuid import uuid4
|
| 7 |
+
|
| 8 |
+
try:
|
| 9 |
+
from compat import Environment
|
| 10 |
+
from graders import grade_task
|
| 11 |
+
from models import (
|
| 12 |
+
HealthResponse,
|
| 13 |
+
HistoryEntry,
|
| 14 |
+
PythonCodeReviewAction,
|
| 15 |
+
PythonCodeReviewObservation,
|
| 16 |
+
PythonCodeReviewState,
|
| 17 |
+
RewardDetails,
|
| 18 |
+
TaskGrade,
|
| 19 |
+
)
|
| 20 |
+
from tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
|
| 21 |
+
except Exception:
|
| 22 |
+
from .compat import Environment
|
| 23 |
+
from .graders import grade_task
|
| 24 |
+
from .models import (
|
| 25 |
+
HealthResponse,
|
| 26 |
+
HistoryEntry,
|
| 27 |
+
PythonCodeReviewAction,
|
| 28 |
+
PythonCodeReviewObservation,
|
| 29 |
+
PythonCodeReviewState,
|
| 30 |
+
RewardDetails,
|
| 31 |
+
TaskGrade,
|
| 32 |
+
)
|
| 33 |
+
from .tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
INVALID_ACTION_PENALTY = 0.10
|
| 37 |
+
NO_PROGRESS_PENALTY = 0.08
|
| 38 |
+
REPEATED_ACTION_PENALTY = 0.05
|
| 39 |
+
BASE_STEP_PENALTY = 0.02
|
| 40 |
+
ANALYZE_STEP_PENALTY = 0.01
|
| 41 |
+
SUBMIT_COMPLETION_BONUS = 0.30
|
| 42 |
+
TIMEOUT_PENALTY = 0.12
|
| 43 |
+
VALID_ACTIONS = {"analyze_code", "edit_code", "run_tests", "submit_solution"}
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
|
| 47 |
+
"""Clamp a scalar to a bounded numeric interval."""
|
| 48 |
+
try:
|
| 49 |
+
return max(low, min(high, float(value)))
|
| 50 |
+
except Exception:
|
| 51 |
+
return low
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _safe_text(value: Any, default: str = "") -> str:
|
| 55 |
+
"""Convert values into short stable strings."""
|
| 56 |
+
try:
|
| 57 |
+
text = str(value)
|
| 58 |
+
except Exception:
|
| 59 |
+
return default
|
| 60 |
+
text = " ".join(text.split())
|
| 61 |
+
return text[:240] if text else default
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class PythonCodeReviewEnvironment(
|
| 65 |
+
Environment[PythonCodeReviewAction, PythonCodeReviewObservation, PythonCodeReviewState]
|
| 66 |
+
):
|
| 67 |
+
"""Deterministic, bounded, evaluator-safe environment for code repair tasks."""
|
| 68 |
+
|
| 69 |
+
SUPPORTS_CONCURRENT_SESSIONS = True
|
| 70 |
+
|
| 71 |
+
def __init__(self, verbose: bool = False) -> None:
|
| 72 |
+
super().__init__()
|
| 73 |
+
self._verbose = bool(verbose)
|
| 74 |
+
self._task_order = self._safe_task_order()
|
| 75 |
+
self._task_cursor = -1
|
| 76 |
+
self._task: Optional[TaskSpec] = None
|
| 77 |
+
self._state = PythonCodeReviewState(episode_id=str(uuid4()))
|
| 78 |
+
self._done = False
|
| 79 |
+
self._last_status = "Call reset() to start."
|
| 80 |
+
self._last_reward = RewardDetails(value=0.0, reason="Environment initialized.")
|
| 81 |
+
self._metrics = self._blank_metrics()
|
| 82 |
+
self._last_action_type = ""
|
| 83 |
+
|
| 84 |
+
def reset(
|
| 85 |
+
self,
|
| 86 |
+
seed: Optional[int] = None,
|
| 87 |
+
episode_id: Optional[str] = None,
|
| 88 |
+
task_id: Optional[str] = None,
|
| 89 |
+
**_: object,
|
| 90 |
+
) -> PythonCodeReviewObservation:
|
| 91 |
+
"""Reset the environment for a deterministic task and return an observation."""
|
| 92 |
+
del seed
|
| 93 |
+
try:
|
| 94 |
+
self._reset_rubric()
|
| 95 |
+
except Exception:
|
| 96 |
+
pass
|
| 97 |
+
|
| 98 |
+
task = self._select_task(task_id)
|
| 99 |
+
self._task = task
|
| 100 |
+
self._done = False
|
| 101 |
+
self._metrics = self._blank_metrics()
|
| 102 |
+
self._last_action_type = ""
|
| 103 |
+
self._last_status = "Inspect the code, run checks, edit the code, then submit."
|
| 104 |
+
self._last_reward = RewardDetails(
|
| 105 |
+
value=0.0,
|
| 106 |
+
reason="Episode reset.",
|
| 107 |
+
prev_score=0.0,
|
| 108 |
+
curr_score=0.0,
|
| 109 |
+
)
|
| 110 |
+
self._state = PythonCodeReviewState(
|
| 111 |
+
episode_id=episode_id or str(uuid4()),
|
| 112 |
+
step_count=0,
|
| 113 |
+
task_id=task.task_id,
|
| 114 |
+
difficulty=task.difficulty,
|
| 115 |
+
task_kind=task.task_kind,
|
| 116 |
+
attempts_remaining=max(int(task.max_steps), 1),
|
| 117 |
+
current_code=task.starter_code,
|
| 118 |
+
errors="",
|
| 119 |
+
test_results="No checks run yet.",
|
| 120 |
+
history=[],
|
| 121 |
+
score=0.0,
|
| 122 |
+
done=False,
|
| 123 |
+
)
|
| 124 |
+
return self._build_observation()
|
| 125 |
+
|
| 126 |
+
def step(
|
| 127 |
+
self,
|
| 128 |
+
action: PythonCodeReviewAction,
|
| 129 |
+
timeout_s: Optional[float] = None,
|
| 130 |
+
**_: object,
|
| 131 |
+
) -> PythonCodeReviewObservation:
|
| 132 |
+
"""Execute one safe environment step and always return a valid observation."""
|
| 133 |
+
del timeout_s
|
| 134 |
+
try:
|
| 135 |
+
if self._task is None:
|
| 136 |
+
return self.reset()
|
| 137 |
+
|
| 138 |
+
if self._done:
|
| 139 |
+
self._last_status = "Episode already completed. Call reset() to continue."
|
| 140 |
+
self._last_reward = RewardDetails(
|
| 141 |
+
value=-INVALID_ACTION_PENALTY,
|
| 142 |
+
invalid_action_penalty=INVALID_ACTION_PENALTY,
|
| 143 |
+
reason="Episode already completed.",
|
| 144 |
+
prev_score=self._metrics["score"],
|
| 145 |
+
curr_score=self._metrics["score"],
|
| 146 |
+
code_changed=False,
|
| 147 |
+
)
|
| 148 |
+
return self._build_observation()
|
| 149 |
+
|
| 150 |
+
self._state.step_count += 1
|
| 151 |
+
action_type = _safe_text(getattr(action, "action_type", "analyze_code"), "analyze_code")
|
| 152 |
+
code = getattr(action, "code", None)
|
| 153 |
+
|
| 154 |
+
if action_type == "analyze_code":
|
| 155 |
+
self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=False)
|
| 156 |
+
elif action_type == "run_tests":
|
| 157 |
+
self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=False)
|
| 158 |
+
elif action_type == "edit_code":
|
| 159 |
+
self._handle_edit(code)
|
| 160 |
+
elif action_type == "submit_solution":
|
| 161 |
+
self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=True)
|
| 162 |
+
self._done = True
|
| 163 |
+
else:
|
| 164 |
+
self._apply_invalid_action(f"Unsupported action_type '{action_type}'.")
|
| 165 |
+
|
| 166 |
+
self._state.attempts_remaining = max(self._task.max_steps - self._state.step_count, 0)
|
| 167 |
+
if self._state.attempts_remaining == 0 and not self._done:
|
| 168 |
+
self._auto_submit()
|
| 169 |
+
|
| 170 |
+
self._state.done = self._done
|
| 171 |
+
return self._build_observation()
|
| 172 |
+
except Exception as exc:
|
| 173 |
+
self._apply_invalid_action(f"Step failure handled: {_safe_text(exc, 'unknown_error')}")
|
| 174 |
+
self._state.done = self._done
|
| 175 |
+
return self._build_observation()
|
| 176 |
+
|
| 177 |
+
@property
|
| 178 |
+
def state(self) -> PythonCodeReviewState:
|
| 179 |
+
"""Return a deep copy of the current environment state."""
|
| 180 |
+
try:
|
| 181 |
+
return self._state.model_copy(deep=True)
|
| 182 |
+
except Exception:
|
| 183 |
+
return PythonCodeReviewState(episode_id=str(uuid4()))
|
| 184 |
+
|
| 185 |
+
def list_task_summaries(self) -> list[object]:
|
| 186 |
+
"""Return public task summaries."""
|
| 187 |
+
try:
|
| 188 |
+
return list_task_summaries()
|
| 189 |
+
except Exception:
|
| 190 |
+
return []
|
| 191 |
+
|
| 192 |
+
def get_task(self, task_id: str) -> object:
|
| 193 |
+
"""Return a single public task descriptor."""
|
| 194 |
+
return self._select_task(task_id).to_descriptor()
|
| 195 |
+
|
| 196 |
+
def health(self) -> HealthResponse:
|
| 197 |
+
"""Return a simple health response."""
|
| 198 |
+
return HealthResponse(task_count=len(self._task_order))
|
| 199 |
+
|
| 200 |
+
def grade_task_submission(self, task_id: str, code: str) -> TaskGrade:
|
| 201 |
+
"""Grade a task submission outside an episode without raising."""
|
| 202 |
+
try:
|
| 203 |
+
task = self._select_task(task_id)
|
| 204 |
+
return self._safe_grade(task=task, candidate_code=code, include_hidden=True)
|
| 205 |
+
except Exception as exc:
|
| 206 |
+
return TaskGrade(score=0.0, details={"error": _safe_text(exc, "grading_failed")})
|
| 207 |
+
|
| 208 |
+
def run_tests(self, code: str, include_hidden: bool = False) -> tuple[float, dict[str, int], TaskGrade]:
|
| 209 |
+
"""Run deterministic grading and return score plus test summary."""
|
| 210 |
+
task = self._task or self._select_task(None)
|
| 211 |
+
grade = self._safe_grade(task=task, candidate_code=code, include_hidden=include_hidden)
|
| 212 |
+
return (
|
| 213 |
+
_clamp(grade.score),
|
| 214 |
+
{"passed": int(grade.tests_passed), "total": int(grade.tests_total)},
|
| 215 |
+
grade,
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
+
def apply_action(self, action: PythonCodeReviewAction) -> str:
|
| 219 |
+
"""Return the candidate code implied by the action."""
|
| 220 |
+
if getattr(action, "action_type", "") == "edit_code":
|
| 221 |
+
code = getattr(action, "code", None)
|
| 222 |
+
return str(code) if code is not None else self._state.current_code
|
| 223 |
+
return self._state.current_code
|
| 224 |
+
|
| 225 |
+
def compute_reward(
    self,
    action_type: str,
    previous_metrics: dict[str, float],
    current_metrics: dict[str, float],
    grade: TaskGrade,
    code_changed: bool,
    invalid_action: bool = False,
) -> RewardDetails:
    """Compute a bounded dynamic reward with progress and efficiency shaping."""
    # Deltas between the previous and current metric snapshots.
    prev_score = _clamp(previous_metrics.get("score", 0.0))
    curr_score = _clamp(current_metrics.get("score", 0.0))
    score_delta = curr_score - prev_score
    test_delta = current_metrics.get("test_fraction", 0.0) - previous_metrics.get("test_fraction", 0.0)
    syntax_delta = current_metrics.get("syntax_score", 0.0) - previous_metrics.get("syntax_score", 0.0)
    quality_delta = current_metrics.get("quality_score", 0.0) - previous_metrics.get("quality_score", 0.0)

    # Efficiency shaping: every step costs a little; analysis costs extra,
    # and repeating the previous action type costs more still.
    step_penalty = BASE_STEP_PENALTY + (ANALYZE_STEP_PENALTY if action_type == "analyze_code" else 0.0)
    repeated_penalty = REPEATED_ACTION_PENALTY if action_type == self._last_action_type else 0.0
    # "No progress" means every tracked metric is flat AND the code is unchanged.
    no_progress = (
        score_delta <= 1e-9
        and test_delta <= 1e-9
        and syntax_delta <= 1e-9
        and quality_delta <= 1e-9
        and not code_changed
    )
    # Invalid actions are penalized separately, not as stagnation.
    stagnation_penalty = NO_PROGRESS_PENALTY if no_progress and not invalid_action else 0.0
    regression_penalty = max(-score_delta, 0.0) * 0.6 + repeated_penalty + step_penalty
    invalid_penalty = INVALID_ACTION_PENALTY if invalid_action else 0.0
    timeout_penalty = TIMEOUT_PENALTY if bool(grade.timed_out) else 0.0

    # Positive shaping: only improvements earn reward (deltas clipped at 0).
    progress_reward = max(score_delta, 0.0) * 0.7
    syntax_reward = max(syntax_delta, 0.0) * 0.5
    test_reward = max(test_delta, 0.0) * 1.0
    quality_bonus = max(quality_delta, 0.0) * 0.2
    # Completion bonus only for an (essentially) perfect submitted solution.
    correctness_bonus = SUBMIT_COMPLETION_BONUS if action_type == "submit_solution" and curr_score >= 0.999 else 0.0

    reward_value = (
        progress_reward
        + syntax_reward
        + test_reward
        + quality_bonus
        + correctness_bonus
        - stagnation_penalty
        - regression_penalty
        - invalid_penalty
        - timeout_penalty
    )
    # Round first, then clamp the final reward into [-1, 1].
    reward_value = max(-1.0, min(1.0, round(reward_value, 6)))
    return RewardDetails(
        value=reward_value,
        syntax_reward=round(syntax_reward, 6),
        test_reward=round(test_reward, 6),
        quality_bonus=round(quality_bonus, 6),
        correctness_bonus=round(correctness_bonus, 6),
        progress_delta=round(progress_reward, 6),
        stagnation_penalty=round(stagnation_penalty, 6),
        regression_penalty=round(regression_penalty, 6),
        invalid_action_penalty=round(invalid_penalty, 6),
        timeout_penalty=round(timeout_penalty, 6),
        reason=f"{action_type} reward computed safely",
        prev_score=round(prev_score, 6),
        curr_score=round(curr_score, 6),
        code_changed=bool(code_changed),
    )
def _safe_task_order(self) -> list[str]:
|
| 292 |
+
"""Load deterministic task ids with a hard fallback."""
|
| 293 |
+
try:
|
| 294 |
+
loaded = list(task_ids())
|
| 295 |
+
if loaded:
|
| 296 |
+
return [str(task_id) for task_id in loaded]
|
| 297 |
+
except Exception:
|
| 298 |
+
pass
|
| 299 |
+
return ["syntax-fix-easy", "bug-fix-medium", "optimization-hard"]
|
| 300 |
+
|
| 301 |
+
def _blank_metrics(self) -> dict[str, float]:
|
| 302 |
+
"""Return an empty metric snapshot."""
|
| 303 |
+
return {
|
| 304 |
+
"score": 0.0,
|
| 305 |
+
"test_fraction": 0.0,
|
| 306 |
+
"syntax_score": 0.0,
|
| 307 |
+
"quality_score": 0.0,
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
def _select_task(self, task_id: Optional[str]) -> TaskSpec:
    """Select the requested task or advance deterministically.

    An explicitly requested task also re-syncs the rotation cursor when its
    id is part of the known rotation.  On any failure the rotation advances
    one slot; as a last resort the easy syntax task is loaded.
    """
    try:
        if task_id:
            task = load_task(task_id)
            if task.task_id in self._task_order:
                # Keep the rotation cursor aligned with the explicit choice.
                self._task_cursor = self._task_order.index(task.task_id)
            return task
    except Exception:
        pass

    try:
        # No (valid) explicit request: rotate to the next task in order.
        self._task_cursor = (self._task_cursor + 1) % len(self._task_order)
        return load_task(self._task_order[self._task_cursor])
    except Exception:
        # Hard fallback — presumably always loadable; TODO confirm.
        return load_task("syntax-fix-easy")
def _safe_grade(self, task: TaskSpec, candidate_code: str, include_hidden: bool) -> TaskGrade:
    """Grade *candidate_code*; a grader crash becomes a zero-score grade."""
    try:
        return grade_task(candidate_code, task, include_hidden=include_hidden)
    except Exception as exc:
        # Report at least one "test" so downstream fractions stay defined.
        fallback_total = max(len(task.visible_tests), 1)
        return TaskGrade(
            score=0.0,
            syntax_score=0.0,
            tests_passed=0,
            tests_total=fallback_total,
            details={"compile_error": "", "error": _safe_text(exc, "grading_failed")},
        )
def _metrics_from_grade(self, grade: TaskGrade) -> dict[str, float]:
    """Normalize a grading result into the reward-shaping metric snapshot."""
    total = max(int(grade.tests_total), 0)
    passed = max(int(grade.tests_passed), 0)
    # With no tests at all (e.g. pure syntax tasks) use the syntax score.
    if total:
        fraction = passed / total
    else:
        fraction = _clamp(grade.syntax_score)
    return {
        "score": _clamp(grade.score),
        "test_fraction": _clamp(fraction),
        "syntax_score": _clamp(grade.syntax_score),
        "quality_score": _clamp(grade.quality_score),
    }
def _format_test_results(self, grade: TaskGrade, include_hidden: bool) -> str:
    """Render a one-line test summary for the observation."""
    scope = "all checks" if include_hidden else "visible checks"
    compile_error = _safe_text(grade.details.get("compile_error", ""), "")
    if compile_error:
        return f"{scope}: compile error: {compile_error}"
    if grade.timed_out:
        return f"{scope}: execution timed out"
    # Syntax tasks carry no pytest cases; compiling IS the whole check.
    if self._task and self._task.task_kind == "syntax_fix":
        return "visible checks: code compiles successfully"
    return f"{scope}: {int(grade.tests_passed)}/{int(grade.tests_total)} passing"
def _build_status(self, action_type: str, grade: TaskGrade) -> str:
|
| 365 |
+
"""Build a human-readable status message."""
|
| 366 |
+
if action_type == "submit_solution":
|
| 367 |
+
return f"Solution submitted. Final score: {_clamp(grade.score):.3f}"
|
| 368 |
+
if action_type == "edit_code":
|
| 369 |
+
if grade.details.get("compile_error"):
|
| 370 |
+
return "Code updated, but syntax issues remain."
|
| 371 |
+
return "Code updated and evaluated."
|
| 372 |
+
if action_type == "run_tests":
|
| 373 |
+
return "Test run completed."
|
| 374 |
+
if action_type == "analyze_code":
|
| 375 |
+
return "Analysis completed."
|
| 376 |
+
return "Action handled safely."
|
| 377 |
+
|
| 378 |
+
def _apply_grade_to_state(self, grade: TaskGrade, include_hidden: bool) -> None:
    """Copy score, compile error and test summary into the episode state."""
    self._state.score = _clamp(grade.score)
    self._state.errors = _safe_text(grade.details.get("compile_error", ""), "")
    self._state.test_results = self._format_test_results(grade, include_hidden=include_hidden)
def _handle_scored_action(self, action_type: str, candidate_code: str, include_hidden: bool) -> None:
    """Grade code, update state, and compute reward for a valid action."""
    task = self._task or self._select_task(None)
    # Snapshot metrics BEFORE mutating state so reward deltas are meaningful.
    previous_metrics = dict(self._metrics)
    prior_code = self._state.current_code
    # Change detection ignores leading/trailing whitespace only.
    code_changed = candidate_code.strip() != prior_code.strip()
    if action_type == "edit_code":
        self._state.current_code = candidate_code
    grade = self._safe_grade(task=task, candidate_code=self._state.current_code, include_hidden=include_hidden)
    current_metrics = self._metrics_from_grade(grade)
    self._apply_grade_to_state(grade, include_hidden=include_hidden)
    self._last_reward = self.compute_reward(
        action_type=action_type,
        previous_metrics=previous_metrics,
        current_metrics=current_metrics,
        grade=grade,
        code_changed=code_changed,
        invalid_action=False,
    )
    self._last_status = self._build_status(action_type, grade)
    # Commit metrics and last-action only AFTER reward computation, which
    # reads self._last_action_type for the repeated-action penalty.
    self._metrics = current_metrics
    self._last_action_type = action_type
    self._append_history(action_type, self._last_status, self._last_reward.value)
def _handle_edit(self, code: Optional[str]) -> None:
    """Validate an ``edit_code`` payload and evaluate the new code."""
    cleaned = (code or "").strip()
    if cleaned:
        self._handle_scored_action(action_type="edit_code", candidate_code=cleaned, include_hidden=False)
    else:
        self._apply_invalid_action("edit_code requires code parameter.")
def _apply_invalid_action(self, reason: str) -> None:
    """Penalize an invalid action while keeping the episode alive."""
    snapshot = dict(self._metrics)
    # Re-use the current metrics so the reward sees exactly zero progress.
    placeholder_grade = TaskGrade(score=snapshot["score"], syntax_score=snapshot["syntax_score"])
    self._last_reward = self.compute_reward(
        action_type="invalid",
        previous_metrics=snapshot,
        current_metrics=snapshot,
        grade=placeholder_grade,
        code_changed=False,
        invalid_action=True,
    )
    self._last_status = reason
    self._append_history("analyze_code", reason, self._last_reward.value)
def _auto_submit(self) -> None:
    """Force-finalize the episode once the attempt budget is exhausted."""
    task = self._task or self._select_task(None)
    final_grade = self._safe_grade(task=task, candidate_code=self._state.current_code, include_hidden=True)
    self._apply_grade_to_state(final_grade, include_hidden=True)
    self._state.done = True
    self._done = True
    self._last_status = f"Auto-submitted. Final score: {_clamp(final_grade.score):.3f}"
def _append_history(self, action_type: str, status: str, reward: float) -> None:
|
| 442 |
+
"""Append one action record to the episode history."""
|
| 443 |
+
try:
|
| 444 |
+
stable_action = action_type if action_type in VALID_ACTIONS else "analyze_code"
|
| 445 |
+
self._state.history.append(
|
| 446 |
+
HistoryEntry(
|
| 447 |
+
step=max(int(self._state.step_count), 0),
|
| 448 |
+
action_type=stable_action,
|
| 449 |
+
status=_safe_text(status, "handled"),
|
| 450 |
+
reward=float(reward),
|
| 451 |
+
)
|
| 452 |
+
)
|
| 453 |
+
except Exception:
|
| 454 |
+
pass
|
| 455 |
+
|
| 456 |
+
def _build_observation(self) -> PythonCodeReviewObservation:
    """Build a valid observation from current state.

    Never raises: if the primary construction fails, a minimal fallback
    observation is returned carrying the error text in ``errors``.
    """
    task = self._task
    try:
        return PythonCodeReviewObservation(
            task_id=self._state.task_id or "",
            title=task.title if task else "",
            difficulty=self._state.difficulty or "easy",
            task_kind=self._state.task_kind,
            task_description=task.task_description if task else "",
            current_code=self._state.current_code,
            errors=self._state.errors,
            test_results=self._state.test_results,
            visible_tests=list(task.visible_tests) if task else [],
            history=list(self._state.history),
            attempts_remaining=max(int(self._state.attempts_remaining), 0),
            last_action_status=self._last_status,
            score=_clamp(self._state.score),
            reward_details=self._last_reward,
            reward=self._last_reward.value,
            done=bool(self._state.done),
            metadata={
                "prev_score": self._last_reward.prev_score,
                "curr_score": self._last_reward.curr_score,
            },
        )
    except Exception as exc:
        # Fallback path: use getattr defaults so a partially-broken state
        # still produces a structurally valid observation.
        return PythonCodeReviewObservation(
            task_id=self._state.task_id or "",
            title="",
            difficulty="easy",
            task_kind=None,
            task_description="",
            current_code=getattr(self._state, "current_code", ""),
            errors=_safe_text(exc, "observation_build_failed"),
            test_results="visible checks: unavailable",
            visible_tests=[],
            history=[],
            attempts_remaining=0,
            last_action_status="Observation fallback returned safely.",
            score=0.0,
            reward_details=RewardDetails(value=0.0, reason="Observation fallback."),
            reward=0.0,
            done=bool(getattr(self._state, "done", False)),
            metadata={},
        )
# Backwards-compatible aliases for older import paths.
PythonEnvironment = PythonCodeReviewEnvironment
CodeReviewEnvironment = PythonCodeReviewEnvironment
pytest-cache-files-1f62ra1g/container_sim/server/graders/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic graders for self-contained server builds."""
|
| 2 |
+
|
| 3 |
+
from .common import clamp_score
|
| 4 |
+
from .optimization import grade_optimization_task
|
| 5 |
+
from .pytest_runner import PytestExecution, run_pytest_suite
|
| 6 |
+
from .syntax import grade_bug_fix_task, grade_syntax_task, grade_task
|
| 7 |
+
|
| 8 |
+
__all__ = [
|
| 9 |
+
"PytestExecution",
|
| 10 |
+
"clamp_score",
|
| 11 |
+
"grade_bug_fix_task",
|
| 12 |
+
"grade_optimization_task",
|
| 13 |
+
"grade_syntax_task",
|
| 14 |
+
"grade_task",
|
| 15 |
+
"run_pytest_suite",
|
| 16 |
+
]
|
| 17 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/graders/common.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared deterministic scoring helpers."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import ast
|
| 6 |
+
import difflib
|
| 7 |
+
import traceback
|
| 8 |
+
from typing import Tuple
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def clamp_score(value: float) -> float:
    """Round *value* to 6 decimals, then clamp into the inclusive [0, 1]."""
    rounded = round(value, 6)
    return max(0.0, min(1.0, rounded))
def syntax_error_message(code: str) -> str:
    """Return a human-readable parse error for *code*, or "" if it parses."""
    try:
        ast.parse(code)
        return ""
    except SyntaxError as exc:
        return f"{exc.msg} (line {exc.lineno}, column {exc.offset})"
    except Exception:
        # Non-syntax parse failures (e.g. recursion limits): last frame only.
        return traceback.format_exc(limit=1).strip()
def compiles(code: str) -> bool:
    """Report whether *code* byte-compiles as a module."""
    try:
        compile(code, "<candidate>", "exec")
        return True
    except Exception:
        return False
def normalized_diff_score(code: str, reference_code: str) -> float:
    """Whitespace-insensitive similarity between *code* and the reference."""
    stripped_candidate = "".join(code.split())
    stripped_reference = "".join(reference_code.split())
    matcher = difflib.SequenceMatcher(a=stripped_candidate, b=stripped_reference)
    return clamp_score(matcher.ratio())
def style_score(code: str, max_line_length: int = 88) -> float:
    """Cheap deterministic style heuristic in [0, 1].

    Weights: 60% lines within the length limit, 20% no tabs anywhere,
    20% no trailing whitespace anywhere.
    """
    lines = code.splitlines() or [""]
    within_limit = sum(1 for line in lines if len(line) <= max_line_length)
    length_component = within_limit / len(lines)
    tabs_component = 0.0 if any("\t" in line for line in lines) else 1.0
    trailing_component = 0.0 if any(line != line.rstrip() for line in lines) else 1.0
    return clamp_score((length_component * 0.6) + (tabs_component * 0.2) + (trailing_component * 0.2))
def nested_loop_depth(tree: ast.AST) -> int:
    """Return the maximum loop-nesting depth anywhere in *tree*.

    Counts ``for``, ``async for`` and ``while`` nodes; any other node
    passes the current depth through to its children unchanged.
    """
    deepest = 0
    pending: list[tuple[ast.AST, int]] = [(tree, 0)]
    while pending:
        node, depth = pending.pop()
        if isinstance(node, (ast.For, ast.AsyncFor, ast.While)):
            depth += 1
            if depth > deepest:
                deepest = depth
        pending.extend((child, depth) for child in ast.iter_child_nodes(node))
    return deepest
def compile_tree(code: str) -> Tuple[ast.AST | None, str]:
|
| 65 |
+
try:
|
| 66 |
+
return ast.parse(code), ""
|
| 67 |
+
except SyntaxError as exc:
|
| 68 |
+
return None, f"{exc.msg} (line {exc.lineno}, column {exc.offset})"
|
| 69 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/graders/optimization.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic grading for optimization tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
import tempfile
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
from .common import clamp_score, compile_tree, nested_loop_depth, style_score
|
| 12 |
+
from .pytest_runner import run_pytest_suite
|
| 13 |
+
from ..models import TaskGrade
|
| 14 |
+
from ..tasks.task_bank import TaskSpec
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _benchmark_script(task: TaskSpec) -> str:
    """Render the in-sandbox benchmark driver for *task*.

    The script imports the entrypoint from ``candidate.py`` (the caller
    rewrites the import to ``starter`` for the baseline run), builds the
    benchmark events via the task-supplied builder, times ``benchmark_repeats``
    invocations, and writes the elapsed time plus row count to
    ``benchmark.json`` in the working directory.
    """
    return f"""import json
import time
from candidate import {task.benchmark_entrypoint}

{task.benchmark_builder}

events = build_benchmark_events()
start = time.perf_counter()
for _ in range({task.benchmark_repeats}):
    result = {task.benchmark_entrypoint}(events)
elapsed = time.perf_counter() - start
Path = __import__("pathlib").Path
Path("benchmark.json").write_text(json.dumps({{"elapsed": elapsed, "rows": len(result)}}), encoding="utf-8")
"""
def benchmark_runtime(candidate_code: str, task: TaskSpec) -> tuple[float, bool, str]:
|
| 35 |
+
assert task.benchmark_entrypoint is not None
|
| 36 |
+
try:
|
| 37 |
+
with tempfile.TemporaryDirectory(prefix="python-code-review-bench-") as temp_dir:
|
| 38 |
+
temp_path = Path(temp_dir)
|
| 39 |
+
(temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
|
| 40 |
+
(temp_path / "starter.py").write_text(task.starter_code, encoding="utf-8")
|
| 41 |
+
(temp_path / "candidate_runner.py").write_text(_benchmark_script(task), encoding="utf-8")
|
| 42 |
+
starter_script = _benchmark_script(task).replace("from candidate import", "from starter import")
|
| 43 |
+
(temp_path / "starter_runner.py").write_text(starter_script, encoding="utf-8")
|
| 44 |
+
|
| 45 |
+
try:
|
| 46 |
+
starter_run = subprocess.run(
|
| 47 |
+
[sys.executable, "starter_runner.py"],
|
| 48 |
+
cwd=temp_path,
|
| 49 |
+
capture_output=True,
|
| 50 |
+
text=True,
|
| 51 |
+
timeout=task.benchmark_timeout_s,
|
| 52 |
+
check=False,
|
| 53 |
+
)
|
| 54 |
+
starter_payload = json.loads((temp_path / "benchmark.json").read_text(encoding="utf-8"))
|
| 55 |
+
candidate_run = subprocess.run(
|
| 56 |
+
[sys.executable, "candidate_runner.py"],
|
| 57 |
+
cwd=temp_path,
|
| 58 |
+
capture_output=True,
|
| 59 |
+
text=True,
|
| 60 |
+
timeout=task.benchmark_timeout_s,
|
| 61 |
+
check=False,
|
| 62 |
+
)
|
| 63 |
+
candidate_payload = json.loads((temp_path / "benchmark.json").read_text(encoding="utf-8"))
|
| 64 |
+
except subprocess.TimeoutExpired as exc:
|
| 65 |
+
output = (exc.stdout or "") + (exc.stderr or "")
|
| 66 |
+
return 0.0, True, (output or "benchmark timed out").strip()
|
| 67 |
+
except Exception as exc:
|
| 68 |
+
return 0.0, False, str(exc)
|
| 69 |
+
|
| 70 |
+
starter_elapsed = max(float(starter_payload["elapsed"]), 1e-9)
|
| 71 |
+
candidate_elapsed = max(float(candidate_payload["elapsed"]), 1e-9)
|
| 72 |
+
speedup = starter_elapsed / candidate_elapsed
|
| 73 |
+
runtime_score = clamp_score(min((speedup - 1.0) / 3.0, 1.0))
|
| 74 |
+
output = "\n".join(
|
| 75 |
+
part
|
| 76 |
+
for part in [
|
| 77 |
+
starter_run.stdout.strip(),
|
| 78 |
+
starter_run.stderr.strip(),
|
| 79 |
+
candidate_run.stdout.strip(),
|
| 80 |
+
candidate_run.stderr.strip(),
|
| 81 |
+
f"starter={starter_elapsed:.6f}s candidate={candidate_elapsed:.6f}s speedup={speedup:.2f}x",
|
| 82 |
+
]
|
| 83 |
+
if part
|
| 84 |
+
)
|
| 85 |
+
return runtime_score, False, output
|
| 86 |
+
except Exception as exc:
|
| 87 |
+
return 0.0, False, str(exc)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def ast_quality_score(code: str, task: TaskSpec) -> float:
    """Score code quality from AST structure and expected quality markers.

    0.2 for a docstring on the first top-level function, 0.4 for loop
    nesting depth <= 1, plus 0.2 per expected marker present in the code;
    clamped into [0, 1].  Unparsable code scores 0.
    """
    tree, _ = compile_tree(code)
    if tree is None:
        return 0.0
    import ast

    first_function = next(
        (node for node in tree.body if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))),
        None,
    )
    points = 0.0
    if first_function is not None and ast.get_docstring(first_function, clean=False):
        points += 0.2
    if nested_loop_depth(tree) <= 1:
        points += 0.4
    for marker in task.expected_quality_markers:
        if marker in code:
            points += 0.2
    return clamp_score(points)
def grade_optimization_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Grade an optimization task: tests, benchmark speedup, quality, style.

    Weights: 50% test pass fraction, 30% benchmark runtime score,
    15% AST quality, 5% style.  A timeout in either the test run or the
    benchmark short-circuits to a zero score with ``timed_out=True``.
    """
    # Correctness first: run visible and hidden tests together.
    execution = run_pytest_suite(candidate_code, [*task.visible_tests, *task.hidden_tests], timeout_s=task.benchmark_timeout_s)
    test_fraction = execution.passed / execution.total if execution.total else 0.0

    if execution.timed_out:
        return TaskGrade(score=0.0, tests_passed=execution.passed, tests_total=execution.total, timed_out=True, details={"tests": execution.output})

    runtime_score, timed_out, benchmark_output = benchmark_runtime(candidate_code, task)
    if timed_out:
        return TaskGrade(score=0.0, tests_passed=execution.passed, tests_total=execution.total, timed_out=True, details={"tests": execution.output, "benchmark": benchmark_output})

    quality_score = ast_quality_score(candidate_code, task)
    pep8_score = style_score(candidate_code, task.style_max_line_length)
    # Weighted blend, clamped into [0, 1].
    score = clamp_score((0.5 * test_fraction) + (0.3 * runtime_score) + (0.15 * quality_score) + (0.05 * pep8_score))
    return TaskGrade(
        score=score,
        syntax_score=1.0,
        tests_passed=execution.passed,
        tests_total=execution.total,
        quality_score=quality_score,
        runtime_score=runtime_score,
        details={
            "tests": execution.output,
            "benchmark": benchmark_output,
            "test_fraction": round(test_fraction, 4),
            "runtime_score": round(runtime_score, 4),
            "style_score": round(pep8_score, 4),
        },
    )
pytest-cache-files-1f62ra1g/container_sim/server/graders/pytest_runner.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Helpers for deterministic pytest execution in temp sandboxes."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
import tempfile
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Iterable
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass(frozen=True)
class PytestExecution:
    """Immutable summary of a single sandboxed pytest run."""

    passed: int      # tests that passed in the "call" phase
    failed: int      # tests that failed in the "call" phase
    total: int       # best-effort total (never less than the supplied cases)
    timed_out: bool  # True when the subprocess hit the wall-clock timeout
    output: str      # combined stdout/stderr (or an error message)
def _test_module_source(tests: Iterable[str]) -> str:
|
| 24 |
+
blocks: list[str] = ["from candidate import * # noqa: F401,F403"]
|
| 25 |
+
for index, test in enumerate(tests, start=1):
|
| 26 |
+
snippet = str(test).strip()
|
| 27 |
+
if not snippet:
|
| 28 |
+
continue
|
| 29 |
+
if snippet.startswith("def test_"):
|
| 30 |
+
blocks.append(snippet)
|
| 31 |
+
continue
|
| 32 |
+
blocks.append(
|
| 33 |
+
"\n".join(
|
| 34 |
+
[
|
| 35 |
+
f"def test_case_{index:03d}():",
|
| 36 |
+
f" assert {snippet}",
|
| 37 |
+
]
|
| 38 |
+
)
|
| 39 |
+
)
|
| 40 |
+
return "\n\n".join(blocks) or "def test_placeholder():\n assert True\n"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _runner_script() -> str:
    """Return the in-sandbox driver that runs pytest and dumps JSON results.

    The script is written next to ``test_candidate.py`` and executed with a
    fresh interpreter.  It counts pass/fail via a plugin hook (call phase
    only) and writes ``pytest_results.json`` so the parent process can read
    structured counts instead of parsing pytest's text output.
    """
    return """import json
import pathlib
import pytest


class Collector:
    def __init__(self) -> None:
        self.passed = 0
        self.failed = 0

    def pytest_runtest_logreport(self, report):
        if report.when != "call":
            return
        if report.passed:
            self.passed += 1
        elif report.failed:
            self.failed += 1


collector = Collector()
exit_code = pytest.main(["-q", "test_candidate.py"], plugins=[collector])
payload = {
    "passed": collector.passed,
    "failed": collector.failed,
    "exit_code": int(exit_code),
}
pathlib.Path("pytest_results.json").write_text(json.dumps(payload), encoding="utf-8")
"""
def run_pytest_suite(candidate_code: str, tests: Iterable[str], timeout_s: float = 3.0) -> PytestExecution:
    """Execute *tests* against *candidate_code* with pytest in a sandbox dir.

    Never raises: timeouts, missing result files and unexpected errors all
    collapse into a ``PytestExecution`` describing the failure.
    """
    suite = list(tests)
    fallback_total = max(len(suite), 1)
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-") as temp_dir:
            sandbox = Path(temp_dir)
            (sandbox / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (sandbox / "test_candidate.py").write_text(_test_module_source(suite), encoding="utf-8")
            (sandbox / "runner.py").write_text(_runner_script(), encoding="utf-8")

            try:
                completed = subprocess.run(
                    [sys.executable, "runner.py"],
                    cwd=sandbox,
                    capture_output=True,
                    text=True,
                    timeout=timeout_s,
                    check=False,
                )
            except subprocess.TimeoutExpired as exc:
                captured = (exc.stdout or "") + (exc.stderr or "")
                return PytestExecution(
                    passed=0,
                    failed=fallback_total,
                    total=fallback_total,
                    timed_out=True,
                    output=(captured or "pytest timed out").strip(),
                )

            combined = ((completed.stdout or "") + (completed.stderr or "")).strip()
            results_file = sandbox / "pytest_results.json"
            if not results_file.exists():
                # Runner crashed before writing results: count everything failed.
                return PytestExecution(0, fallback_total, fallback_total, False, combined)

            try:
                payload = json.loads(results_file.read_text(encoding="utf-8"))
            except Exception as exc:
                return PytestExecution(0, fallback_total, fallback_total, False, (combined or str(exc)).strip())

            passed = int(payload.get("passed", 0))
            failed = int(payload.get("failed", 0))
            # Total is best-effort: never report fewer slots than supplied cases.
            return PytestExecution(passed, failed, max(passed + failed, len(suite)), False, combined)
    except Exception as exc:
        return PytestExecution(0, fallback_total, fallback_total, False, str(exc))
pytest-cache-files-1f62ra1g/container_sim/server/graders/syntax.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task graders for syntax and bug-fix tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from .common import clamp_score, compiles, normalized_diff_score, style_score, syntax_error_message
|
| 6 |
+
from .optimization import grade_optimization_task
|
| 7 |
+
from .pytest_runner import run_pytest_suite
|
| 8 |
+
from ..models import TaskGrade
|
| 9 |
+
from ..tasks.task_bank import TaskSpec
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def grade_syntax_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Grade a syntax-repair task: compiling earns full credit.

    Non-compiling code gets partial credit proportional to its similarity
    with the reference solution.
    """
    error = syntax_error_message(candidate_code)
    quality_base = style_score(candidate_code, task.style_max_line_length)
    if not error:
        return TaskGrade(score=1.0, syntax_score=1.0, quality_score=quality_base, details={"compile_error": ""})
    similarity = normalized_diff_score(candidate_code, task.reference_code)
    partial_credit = clamp_score(0.15 + (0.55 * similarity))
    return TaskGrade(
        score=partial_credit,
        syntax_score=0.0,
        quality_score=similarity * quality_base,
        details={"compile_error": error},
    )
def grade_bug_fix_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
    """Grade a bug-fix submission by running its pytest suite.

    Non-compiling code scores zero. Otherwise the score is the fraction of
    passed tests (visible plus, optionally, hidden), with a style-based
    quality term. A timed-out suite yields no correctness credit.
    """
    if not compiles(candidate_code):
        return TaskGrade(
            score=0.0,
            syntax_score=0.0,
            details={"compile_error": syntax_error_message(candidate_code)},
        )

    suite = list(task.visible_tests)
    if include_hidden:
        suite.extend(task.hidden_tests)

    run = run_pytest_suite(candidate_code, suite, timeout_s=3.0)
    if run.timed_out:
        # The code compiled, so syntax credit is kept even on a hang.
        return TaskGrade(
            score=0.0,
            syntax_score=1.0,
            tests_passed=run.passed,
            tests_total=run.total,
            timed_out=True,
            details={"compile_error": "", "tests": run.output},
        )

    fraction = run.passed / run.total if run.total else 0.0
    return TaskGrade(
        score=clamp_score(fraction),
        syntax_score=1.0,
        tests_passed=run.passed,
        tests_total=run.total,
        quality_score=style_score(candidate_code, task.style_max_line_length),
        details={"compile_error": "", "tests": run.output},
    )
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def grade_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
    """Dispatch grading to the handler matching the task's kind."""
    kind = task.task_kind
    if kind == "bug_fix":
        return grade_bug_fix_task(candidate_code, task, include_hidden=include_hidden)
    if kind == "syntax_fix":
        return grade_syntax_task(candidate_code, task)
    # Any other kind falls through to the optimization grader.
    return grade_optimization_task(candidate_code, task)
|
| 60 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/grading.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic grading helpers for PR-review tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from typing import Iterable, List, Optional, Sequence, Set
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
from models import ReviewFinding, TaskGrade
|
| 11 |
+
from server.task_bank import RubricIssue, TaskSpec
|
| 12 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 13 |
+
from ..models import ReviewFinding, TaskGrade
|
| 14 |
+
from .task_bank import RubricIssue, TaskSpec
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
FALSE_POSITIVE_PENALTY = 0.10
|
| 18 |
+
DUPLICATE_PENALTY = 0.05
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass(frozen=True)
class FindingMatch:
    """Result of matching one finding against the rubric."""

    # Identifier of the matched rubric issue; None when the finding matched
    # nothing (a false positive) or was detected as a duplicate.
    issue_id: Optional[str]
    # True when the finding's fingerprint was already seen this episode.
    duplicate: bool = False
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def finding_fingerprint(finding: ReviewFinding) -> str:
    """Build a deterministic fingerprint for duplicate detection.

    Tokenizes the finding's location and free-text fields, then joins the
    sorted tokens so trivially reordered findings collapse to one value.
    """
    parts = (
        finding.file_path,
        str(finding.line or 0),
        finding.category,
        finding.severity,
        finding.title,
        finding.explanation,
        finding.suggested_fix,
    )
    return "|".join(sorted(tokens(" ".join(parts))))
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def match_finding(
    finding: ReviewFinding,
    task: TaskSpec,
    matched_issue_ids: Set[str],
    seen_fingerprints: Set[str],
) -> FindingMatch:
    """Match one finding against the remaining rubric issues.

    Exact duplicates (by fingerprint) short-circuit before any rubric
    comparison, and rubric issues already claimed this episode are never
    matched a second time.
    """
    if finding_fingerprint(finding) in seen_fingerprints:
        return FindingMatch(issue_id=None, duplicate=True)

    unclaimed = (
        issue for issue in task.rubric_issues if issue.issue_id not in matched_issue_ids
    )
    for issue in unclaimed:
        if finding_matches_issue(finding, issue):
            return FindingMatch(issue_id=issue.issue_id)
    return FindingMatch(issue_id=None)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def finding_matches_issue(finding: ReviewFinding, issue: RubricIssue) -> bool:
    """Return True when a finding deterministically matches a rubric issue.

    A match requires the same file, category, and severity, a line within
    two lines of the rubric location, and enough keyword hits in the
    finding's free text.
    """
    location_matches = (
        finding.file_path == issue.file_path
        and finding.category == issue.category
        and finding.severity == issue.severity
        and finding.line is not None
        and abs(finding.line - issue.line) <= 2
    )
    if not location_matches:
        return False

    words = tokens(" ".join([finding.title, finding.explanation, finding.suggested_fix]))
    hits = sum(1 for keyword in issue.keywords if keyword in words)
    return hits >= issue.min_keyword_hits
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def score_task(
    task: TaskSpec,
    matched_issue_ids: Iterable[str],
    false_positives: int = 0,
    duplicate_findings: int = 0,
) -> TaskGrade:
    """Score a task from cumulative episode state.

    The score is the summed weight of matched rubric issues, minus fixed
    penalties for false positives and duplicates, clamped into [0, 1].
    """
    matched = set(matched_issue_ids)
    weight = sum(
        issue.weight for issue in task.rubric_issues if issue.issue_id in matched
    )
    penalized = (
        weight
        - false_positives * FALSE_POSITIVE_PENALTY
        - duplicate_findings * DUPLICATE_PENALTY
    )
    # Round away float noise before clamping into the valid score range.
    final = max(0.0, min(1.0, round(penalized, 6)))
    return TaskGrade(
        score=final,
        matched_issue_ids=sorted(matched),
        false_positives=false_positives,
        duplicate_findings=duplicate_findings,
        matched_weight=min(1.0, round(weight, 6)),
    )
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def grade_findings(task: TaskSpec, findings: Sequence[ReviewFinding]) -> TaskGrade:
    """Offline-grade a batch of findings for one task.

    Walks the findings in order, tracking matched rubric issues, duplicate
    fingerprints, and false positives, then delegates to ``score_task``.
    """
    matched: Set[str] = set()
    fingerprints: Set[str] = set()
    false_positive_count = 0
    duplicate_count = 0

    for finding in findings:
        outcome = match_finding(
            finding=finding,
            task=task,
            matched_issue_ids=matched,
            seen_fingerprints=fingerprints,
        )
        if outcome.duplicate:
            duplicate_count += 1
            continue
        # Only non-duplicate findings contribute a fingerprint, so repeats
        # of a false positive are counted as duplicates, not new FPs.
        fingerprints.add(finding_fingerprint(finding))
        if outcome.issue_id is None:
            false_positive_count += 1
        else:
            matched.add(outcome.issue_id)

    return score_task(
        task=task,
        matched_issue_ids=matched,
        false_positives=false_positive_count,
        duplicate_findings=duplicate_count,
    )
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def tokens(text: str) -> Set[str]:
    """Normalize free text into deterministic comparison tokens.

    Lowercases the input and extracts runs of ASCII lowercase letters,
    digits, and underscores, returned as a set so comparisons ignore
    ordering and repetition.
    """
    lowered = text.lower()
    return {match.group(0) for match in re.finditer(r"[a-z0-9_]+", lowered)}
|
| 147 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/models.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typed models for the self-contained server package."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Dict, List, Literal, Optional
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
from .compat import Action, Observation, State
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Shared literal vocabularies used by actions, observations, and grading.
Difficulty = Literal["easy", "medium", "hard"]
TaskKind = Literal["syntax_fix", "bug_fix", "optimization"]
ActionType = Literal["analyze_code", "edit_code", "run_tests", "submit_solution"]
Category = Literal["bug", "security", "performance", "maintainability", "style", "testing"]
Severity = Literal["critical", "warning", "info"]


class HistoryEntry(BaseModel):
    """One step of episode history shown back to the agent."""

    step: int = Field(..., ge=0)
    action_type: ActionType
    status: str
    reward: float


class RewardDetails(BaseModel):
    """Breakdown of one step's reward into its component terms."""

    # Total reward value for the step; the remaining fields are components.
    value: float
    syntax_reward: float = 0.0
    test_reward: float = 0.0
    quality_bonus: float = 0.0
    correctness_bonus: float = 0.0
    progress_delta: float = 0.0
    stagnation_penalty: float = 0.0
    regression_penalty: float = 0.0
    invalid_action_penalty: float = 0.0
    timeout_penalty: float = 0.0
    # Human-readable explanation of how the reward was computed.
    reason: str
    prev_score: float = 0.0
    curr_score: float = 0.0
    code_changed: bool = False


class PythonCodeReviewAction(Action):
    """Agent action: one of the ActionType verbs plus optional code payload."""

    action_type: ActionType
    # New code for edit_code/submit_solution; unused by other action types.
    code: Optional[str] = None


class PythonCodeReviewObservation(Observation):
    """Observation returned to the agent after reset or step."""

    task_id: str
    title: str = ""
    difficulty: Difficulty
    task_kind: Optional[TaskKind] = None
    task_description: str
    current_code: str
    errors: str
    test_results: str
    visible_tests: List[str] = Field(default_factory=list)
    history: List[HistoryEntry] = Field(default_factory=list)
    attempts_remaining: int = Field(..., ge=0)
    last_action_status: str = ""
    score: float = Field(..., ge=0.0, le=1.0)
    # Defaults to a zero-valued "Reset" reward on episode start.
    reward_details: RewardDetails = Field(
        default_factory=lambda: RewardDetails(value=0.0, reason="Reset")
    )


class PythonCodeReviewState(State):
    """Server-side episode state; mirrors the observation plus bookkeeping."""

    episode_id: str
    step_count: int = Field(default=0, ge=0)
    task_id: Optional[str] = None
    difficulty: Optional[Difficulty] = None
    task_kind: Optional[TaskKind] = None
    attempts_remaining: int = Field(default=0, ge=0)
    current_code: str = ""
    errors: str = ""
    test_results: str = ""
    history: List[HistoryEntry] = Field(default_factory=list)
    score: float = Field(default=0.0, ge=0.0, le=1.0)
    done: bool = False


class TaskDescriptor(BaseModel):
    """Public task view exposed to clients (no hidden rubric data)."""

    task_id: str
    title: str
    difficulty: Difficulty
    task_kind: Optional[TaskKind] = None
    task_description: str = ""
    starter_code: str = ""
    visible_tests: List[str] = Field(default_factory=list)
    goal: str = ""
    repo_summary: str = ""
    changed_files: List[str] = Field(default_factory=list)
    available_files: List[str] = Field(default_factory=list)
    max_steps: int = Field(..., ge=1)


class TaskSummary(BaseModel):
    """Compact listing entry for task-catalog endpoints."""

    task_id: str
    difficulty: Difficulty
    title: str
    goal: str = ""


class ReviewFinding(BaseModel):
    """A single reviewer finding with location and free-text fields."""

    title: str
    file_path: str = ""
    line: Optional[int] = Field(default=None, ge=1)
    category: Category = "bug"
    severity: Severity = "warning"
    rationale: str = ""
    recommendation: str = ""
    rule_id: str = ""

    # Aliases kept for callers (e.g. the grader) that use the older
    # explanation/suggested_fix field names.
    @property
    def explanation(self) -> str:
        return self.rationale

    @property
    def suggested_fix(self) -> str:
        return self.recommendation


class DirectReviewResponse(BaseModel):
    """Response body for the direct (non-episode) review endpoint."""

    issues: List[ReviewFinding] = Field(default_factory=list)
    summary: str = ""
    score: float = Field(default=0.0, ge=0.0, le=1.0)
    improved_code: Optional[str] = None


class TaskGrade(BaseModel):
    """Unified grading result; different task kinds fill different fields."""

    score: float = Field(..., ge=0.0, le=1.0)
    syntax_score: float = Field(default=0.0, ge=0.0, le=1.0)
    tests_passed: int = Field(default=0, ge=0)
    tests_total: int = Field(default=0, ge=0)
    quality_score: float = Field(default=0.0, ge=0.0, le=1.0)
    runtime_score: float = Field(default=0.0, ge=0.0, le=1.0)
    timed_out: bool = False
    # Fields below are used by rubric-based PR-review grading.
    matched_issue_ids: List[str] = Field(default_factory=list)
    false_positives: int = Field(default=0, ge=0)
    duplicate_findings: int = Field(default=0, ge=0)
    matched_weight: float = Field(default=0.0, ge=0.0, le=1.0)
    details: Dict[str, Any] = Field(default_factory=dict)


class HealthResponse(BaseModel):
    """Response body for the health-check endpoint."""

    status: Literal["ok"] = "ok"
    environment: str = "python_code_review_env"
    task_count: int = Field(default=0, ge=0)
|
| 149 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/python_env_environment.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility shim for older imports."""
|
| 2 |
+
|
| 3 |
+
try:
    # Absolute import path used when the server directory itself is on
    # sys.path (e.g. when modules are loaded as top-level scripts).
    from server.code_review_environment import PythonEnvironment
except ModuleNotFoundError:  # pragma: no cover
    # Package-relative fallback for when this file is imported as part of
    # the server package.
    from .code_review_environment import PythonEnvironment


__all__ = ["PythonEnvironment"]
|
pytest-cache-files-1f62ra1g/container_sim/server/requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.2
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn[standard]>=0.30.0
|
| 4 |
+
openai>=1.40.0
|
| 5 |
+
pytest>=8.0.0
|
| 6 |
+
pydantic>=2.0.0
|
pytest-cache-files-1f62ra1g/container_sim/server/static_review.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic static-review helpers for arbitrary Python code.
|
| 2 |
+
|
| 3 |
+
Unlike the benchmark grader, this module does not compare against hidden rubric
|
| 4 |
+
items. Instead, it performs direct AST-based review on arbitrary snippets so it
|
| 5 |
+
can be used for manual testing, examples, and future dataset generation.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import ast
|
| 11 |
+
from typing import List, Optional
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
from models import DirectReviewResponse, ReviewFinding
|
| 15 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 16 |
+
from ..models import DirectReviewResponse, ReviewFinding
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class _StaticAnalyzer(ast.NodeVisitor):
    """AST visitor that emits structured review findings.

    The visitor intentionally focuses on a small set of high-signal patterns so
    the direct-review endpoint stays predictable and easy to understand.
    """

    def __init__(self) -> None:
        # Findings are accumulated in visit order; deduplication happens later.
        self.issues: List[ReviewFinding] = []

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:  # noqa: N802
        """Flag mutable default arguments in function definitions."""

        # Fix: also inspect keyword-only defaults. `kw_defaults` may contain
        # None placeholders for keyword-only args without a default value.
        defaults = list(node.args.defaults) + [
            default for default in node.args.kw_defaults if default is not None
        ]
        for default in defaults:
            if isinstance(default, (ast.List, ast.Dict, ast.Set)):
                self.issues.append(
                    ReviewFinding(
                        title="Mutable default argument",
                        line=getattr(default, "lineno", node.lineno),
                        category="bug",
                        severity="warning",
                        rationale=(
                            "Mutable defaults persist across calls and can leak state "
                            "between unrelated requests."
                        ),
                        recommendation="Use None as the default and create the object inside the function.",
                        rule_id="mutable-default-list",
                    )
                )
        self.generic_visit(node)

    # Fix: async functions share the same mutable-default pitfall; previously
    # `async def` bodies were recursed into but their defaults were never
    # checked because ast.AsyncFunctionDef has its own visitor hook.
    visit_AsyncFunctionDef = visit_FunctionDef

    def visit_Call(self, node: ast.Call) -> None:  # noqa: N802
        """Inspect function calls for obviously unsafe or noisy patterns."""

        func_name = self._call_name(node)
        if func_name in {"eval", "exec"}:
            self.issues.append(
                ReviewFinding(
                    title=f"Avoid {func_name} on untrusted input",
                    line=node.lineno,
                    category="security",
                    severity="critical",
                    rationale=(
                        f"{func_name} executes arbitrary code and is unsafe on "
                        "user-controlled input."
                    ),
                    recommendation="Use a safe parser or a whitelist-based evaluator.",
                    rule_id="avoid-eval" if func_name == "eval" else "avoid-exec",
                )
            )
        if func_name.endswith("check_output") or func_name.endswith("run"):
            for keyword in node.keywords:
                # `shell=True` is only a problem when the command comes from a
                # shell-parsed string, but this heuristic is high value for
                # review and intentionally conservative.
                if keyword.arg == "shell" and isinstance(keyword.value, ast.Constant) and keyword.value.value is True:
                    self.issues.append(
                        ReviewFinding(
                            title="shell=True with dynamic input",
                            line=node.lineno,
                            category="security",
                            severity="critical",
                            rationale=(
                                "shell=True executes through the shell and can allow "
                                "command injection when the command string is interpolated."
                            ),
                            recommendation="Pass a list of arguments and keep shell=False.",
                            rule_id="shell-true-command-injection",
                        )
                    )
        if func_name == "print":
            self.issues.append(
                ReviewFinding(
                    title="Print statement in application logic",
                    line=node.lineno,
                    category="style",
                    severity="info",
                    rationale="Production services should prefer structured logging over print statements.",
                    recommendation="Use the logging module or return the value to the caller.",
                    rule_id="print-statement",
                )
            )
        self.generic_visit(node)

    def visit_ExceptHandler(self, node: ast.ExceptHandler) -> None:  # noqa: N802
        """Flag bare exception handlers that hide failures."""

        # `except:` with no exception type has node.type is None.
        if node.type is None:
            self.issues.append(
                ReviewFinding(
                    title="Bare except",
                    line=node.lineno,
                    category="maintainability",
                    severity="warning",
                    rationale="Bare except catches KeyboardInterrupt and other system-level exceptions.",
                    recommendation="Catch a specific exception and record the failure.",
                    rule_id="bare-except",
                )
            )
        self.generic_visit(node)

    def visit_For(self, node: ast.For) -> None:  # noqa: N802
        """Look for list-membership checks nested in loops."""

        # At most one finding per loop; `break` stops after the first hit.
        for child in ast.walk(node):
            if isinstance(child, ast.Compare) and any(
                isinstance(operator, (ast.In, ast.NotIn)) for operator in child.ops
            ):
                if isinstance(child.comparators[0], ast.Name):
                    self.issues.append(
                        ReviewFinding(
                            title="Potential quadratic membership check inside loop",
                            line=child.lineno,
                            category="performance",
                            severity="warning",
                            rationale=(
                                "Repeated membership checks against a list inside a loop "
                                "can degrade to quadratic runtime."
                            ),
                            recommendation="Use a set or dict for O(1) membership checks.",
                            rule_id="quadratic-membership-check",
                        )
                    )
                    break
        self.generic_visit(node)

    @staticmethod
    def _call_name(node: ast.Call) -> str:
        """Extract a dotted function name such as `subprocess.run`."""

        func = node.func
        if isinstance(func, ast.Name):
            return func.id
        if isinstance(func, ast.Attribute):
            prefix = _StaticAnalyzer._attribute_prefix(func.value)
            return f"{prefix}.{func.attr}" if prefix else func.attr
        # Calls on arbitrary expressions (lambdas, subscripts) have no name.
        return ""

    @staticmethod
    def _attribute_prefix(node: ast.AST) -> str:
        """Reconstruct the left-hand side of an attribute chain."""

        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            prefix = _StaticAnalyzer._attribute_prefix(node.value)
            return f"{prefix}.{node.attr}" if prefix else node.attr
        return ""
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def analyze_python_code(code: str) -> List[ReviewFinding]:
    """Analyze arbitrary Python code and return structured findings.

    Empty input and syntax errors are reported as findings so callers
    always receive a valid response shape rather than an exception.
    """
    if not code.strip():
        empty_submission = ReviewFinding(
            title="No code provided",
            category="bug",
            severity="warning",
            rationale="The reviewer cannot inspect an empty submission.",
            recommendation="Provide Python source code.",
            rule_id="empty-input",
        )
        return [empty_submission]

    try:
        tree = ast.parse(code)
    except SyntaxError as exc:
        # Turn the parse failure into a single critical finding.
        broken = ReviewFinding(
            title="Syntax error",
            line=exc.lineno,
            category="bug",
            severity="critical",
            rationale=exc.msg,
            recommendation="Fix the syntax error before running static review.",
            rule_id="syntax-error",
        )
        return [broken]

    visitor = _StaticAnalyzer()
    visitor.visit(tree)
    return _deduplicate(visitor.issues)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def build_direct_review_response(
    code: str, context: Optional[str] = None
) -> DirectReviewResponse:
    """Build the public direct-review response for the `/review` route.

    The score is intentionally simple: each finding subtracts a fixed
    penalty by severity (critical 0.3, warning 0.15, info 0.05), and the
    result is clamped into [0, 1].
    """
    issues = analyze_python_code(code)
    penalty_by_severity = {"critical": 0.3, "warning": 0.15}
    total_penalty = 0.0
    for issue in issues:
        total_penalty += penalty_by_severity.get(issue.severity, 0.05)

    return DirectReviewResponse(
        issues=issues,
        summary=_build_summary(issues, context),
        score=max(0.0, min(1.0, 1.0 - total_penalty)),
        improved_code=_suggest_improved_code(code, issues),
    )
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def _build_summary(issues: List[ReviewFinding], context: Optional[str]) -> str:
    """Create a concise human-readable summary for the direct-review response."""

    if issues:
        counts = {"critical": 0, "warning": 0, "info": 0}
        for issue in issues:
            counts[issue.severity] = counts.get(issue.severity, 0) + 1
        base = (
            f"Detected {len(issues)} issue(s): {counts['critical']} critical, "
            f"{counts['warning']} warning, {counts['info']} info."
        )
    else:
        base = "No obvious issues were detected by the deterministic reviewer."
    # Caller-supplied context is appended verbatim when present.
    return f"{base} Context: {context}" if context else base
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def _suggest_improved_code(code: str, issues: List[ReviewFinding]) -> Optional[str]:
    """Append high-level fix directions to the submitted code.

    Returns None when there are no issues; otherwise returns the original
    code with a trailing comment listing deduplicated recommendations.
    """
    if not issues:
        return None
    recommendations = [issue.recommendation for issue in issues if issue.recommendation]
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    directions = " | ".join(dict.fromkeys(recommendations))
    return f"{code.rstrip()}\n\n# Suggested review directions: {directions}"
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def _deduplicate(findings: List[ReviewFinding]) -> List[ReviewFinding]:
    """Drop duplicate findings that refer to the same rule and line.

    First occurrence wins; order is otherwise preserved.
    """
    kept: List[ReviewFinding] = []
    visited = set()
    for finding in findings:
        identity = (finding.rule_id, finding.line, finding.category)
        if identity not in visited:
            visited.add(identity)
            kept.append(finding)
    return kept
|
pytest-cache-files-1f62ra1g/container_sim/server/task_bank.py
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Static PR-review tasks and hidden grading rubrics."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Dict, Iterable, List, Sequence
|
| 7 |
+
|
| 8 |
+
try:
|
| 9 |
+
from models import Category, Difficulty, Severity, TaskDescriptor, TaskSummary
|
| 10 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 11 |
+
from ..models import Category, Difficulty, Severity, TaskDescriptor, TaskSummary
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass(frozen=True)
class RubricIssue:
    """One hidden issue that can be matched by the deterministic grader."""

    # Stable identifier; the grader uses it to avoid matching twice.
    issue_id: str
    # Location the finding must point at (the grader tolerates a small
    # line offset around `line`).
    file_path: str
    line: int
    category: Category
    severity: Severity
    # Tokens searched for in the finding text; at least `min_keyword_hits`
    # of them must appear for the match to count.
    keywords: Sequence[str]
    min_keyword_hits: int
    # Contribution to the task score when this issue is matched.
    weight: float
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass(frozen=True)
class TaskSpec:
    """Complete task definition, including hidden rubric metadata."""

    task_id: str
    difficulty: Difficulty
    title: str
    goal: str
    repo_summary: str
    # Diff shown to the reviewer; file_contents maps path -> full file text.
    visible_diff: str
    file_contents: Dict[str, str]
    changed_files: Sequence[str]
    # Hidden grading rubric; deliberately omitted from to_descriptor() and
    # to_summary() so it never leaks to clients.
    rubric_issues: Sequence[RubricIssue]
    max_steps: int

    @property
    def available_files(self) -> List[str]:
        """All file paths the reviewer may open for this task."""
        return list(self.file_contents.keys())

    def to_descriptor(self) -> TaskDescriptor:
        """Public task view for an active episode (no rubric fields)."""
        return TaskDescriptor(
            task_id=self.task_id,
            difficulty=self.difficulty,
            title=self.title,
            goal=self.goal,
            repo_summary=self.repo_summary,
            changed_files=list(self.changed_files),
            available_files=self.available_files,
            max_steps=self.max_steps,
        )

    def to_summary(self) -> TaskSummary:
        """Compact listing entry used by task-catalog endpoints."""
        return TaskSummary(
            task_id=self.task_id,
            difficulty=self.difficulty,
            title=self.title,
            goal=self.goal,
        )
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
TASKS: List[TaskSpec] = [
|
| 69 |
+
TaskSpec(
|
| 70 |
+
task_id="py-pr-review-easy",
|
| 71 |
+
difficulty="easy",
|
| 72 |
+
title="Retry Delay Regression",
|
| 73 |
+
goal=(
|
| 74 |
+
"Review the pull request and identify the real bug introduced in the retry "
|
| 75 |
+
"delay helper before it ships."
|
| 76 |
+
),
|
| 77 |
+
repo_summary=(
|
| 78 |
+
"This service computes retry delays for background notification delivery. "
|
| 79 |
+
"The change is intended to relax validation for legacy callers."
|
| 80 |
+
),
|
| 81 |
+
visible_diff="\n".join(
|
| 82 |
+
[
|
| 83 |
+
"diff --git a/src/notifications/retry.py b/src/notifications/retry.py",
|
| 84 |
+
"@@",
|
| 85 |
+
"- if base_delay <= 0:",
|
| 86 |
+
"+ if base_delay < 0:",
|
| 87 |
+
" return 0.0",
|
| 88 |
+
]
|
| 89 |
+
),
|
| 90 |
+
file_contents={
|
| 91 |
+
"src/notifications/retry.py": "\n".join(
|
| 92 |
+
[
|
| 93 |
+
"from __future__ import annotations",
|
| 94 |
+
"",
|
| 95 |
+
"def calculate_retry_delay(attempt: int, base_delay: float = 2.0) -> float:",
|
| 96 |
+
' """Return the retry delay in seconds."""',
|
| 97 |
+
" if attempt < 0:",
|
| 98 |
+
' raise ValueError(\"attempt must be >= 0\")',
|
| 99 |
+
" if base_delay < 0:",
|
| 100 |
+
" return 0.0",
|
| 101 |
+
" return attempt / base_delay",
|
| 102 |
+
]
|
| 103 |
+
)
|
| 104 |
+
},
|
| 105 |
+
changed_files=("src/notifications/retry.py",),
|
| 106 |
+
rubric_issues=(
|
| 107 |
+
RubricIssue(
|
| 108 |
+
issue_id="zero-base-delay-divides",
|
| 109 |
+
file_path="src/notifications/retry.py",
|
| 110 |
+
line=7,
|
| 111 |
+
category="bug",
|
| 112 |
+
severity="warning",
|
| 113 |
+
keywords=("zero", "division", "base_delay"),
|
| 114 |
+
min_keyword_hits=2,
|
| 115 |
+
weight=1.0,
|
| 116 |
+
),
|
| 117 |
+
),
|
| 118 |
+
max_steps=4,
|
| 119 |
+
),
|
| 120 |
+
TaskSpec(
|
| 121 |
+
task_id="py-pr-review-medium",
|
| 122 |
+
difficulty="medium",
|
| 123 |
+
title="Coupon Billing Rollout",
|
| 124 |
+
goal=(
|
| 125 |
+
"Review the billing change and identify both the production regression and "
|
| 126 |
+
"the missing coverage that would have caught it."
|
| 127 |
+
),
|
| 128 |
+
repo_summary=(
|
| 129 |
+
"The billing service is adding coupon support for one-off invoices. The PR "
|
| 130 |
+
"touches both the service code and its unit tests."
|
| 131 |
+
),
|
| 132 |
+
visible_diff="\n".join(
|
| 133 |
+
[
|
| 134 |
+
"diff --git a/app/billing/invoice_service.py b/app/billing/invoice_service.py",
|
| 135 |
+
"@@",
|
| 136 |
+
" def charge_invoice(order: dict, gateway: Gateway) -> str:",
|
| 137 |
+
"- return gateway.charge(order[\"customer_id\"], order[\"amount_cents\"])",
|
| 138 |
+
"+ total = order[\"amount_cents\"]",
|
| 139 |
+
"+ coupon = order.get(\"coupon_code\")",
|
| 140 |
+
"+ if coupon:",
|
| 141 |
+
"+ discount = gateway.lookup_discount(coupon)",
|
| 142 |
+
"+ total = max(total - discount, 0)",
|
| 143 |
+
"+ return gateway.charge(order[\"customer_id\"], order[\"amount_cents\"])",
|
| 144 |
+
"",
|
| 145 |
+
"diff --git a/tests/test_invoice_service.py b/tests/test_invoice_service.py",
|
| 146 |
+
"@@",
|
| 147 |
+
" class FakeGateway:",
|
| 148 |
+
"+ def lookup_discount(self, coupon: str) -> int:",
|
| 149 |
+
"+ return 250",
|
| 150 |
+
]
|
| 151 |
+
),
|
| 152 |
+
file_contents={
|
| 153 |
+
"app/billing/invoice_service.py": "\n".join(
|
| 154 |
+
[
|
| 155 |
+
"from gateway import Gateway",
|
| 156 |
+
"",
|
| 157 |
+
"def charge_invoice(order: dict, gateway: Gateway) -> str:",
|
| 158 |
+
' total = order["amount_cents"]',
|
| 159 |
+
' coupon = order.get("coupon_code")',
|
| 160 |
+
" if coupon:",
|
| 161 |
+
" discount = gateway.lookup_discount(coupon)",
|
| 162 |
+
" total = max(total - discount, 0)",
|
| 163 |
+
' return gateway.charge(order["customer_id"], order["amount_cents"])',
|
| 164 |
+
]
|
| 165 |
+
),
|
| 166 |
+
"tests/test_invoice_service.py": "\n".join(
|
| 167 |
+
[
|
| 168 |
+
"from app.billing.invoice_service import charge_invoice",
|
| 169 |
+
"",
|
| 170 |
+
"class FakeGateway:",
|
| 171 |
+
" def lookup_discount(self, coupon: str) -> int:",
|
| 172 |
+
" return 250",
|
| 173 |
+
"",
|
| 174 |
+
" def charge(self, customer_id: str, amount_cents: int) -> str:",
|
| 175 |
+
" self.last_charge = (customer_id, amount_cents)",
|
| 176 |
+
' return "charge_123"',
|
| 177 |
+
"",
|
| 178 |
+
"def test_charge_invoice_without_coupon():",
|
| 179 |
+
" gateway = FakeGateway()",
|
| 180 |
+
' charge_invoice({"customer_id": "cus_1", "amount_cents": 1000}, gateway)',
|
| 181 |
+
' assert gateway.last_charge == ("cus_1", 1000)',
|
| 182 |
+
]
|
| 183 |
+
),
|
| 184 |
+
},
|
| 185 |
+
changed_files=("app/billing/invoice_service.py", "tests/test_invoice_service.py"),
|
| 186 |
+
rubric_issues=(
|
| 187 |
+
RubricIssue(
|
| 188 |
+
issue_id="discount-total-unused",
|
| 189 |
+
file_path="app/billing/invoice_service.py",
|
| 190 |
+
line=8,
|
| 191 |
+
category="bug",
|
| 192 |
+
severity="warning",
|
| 193 |
+
keywords=("discount", "total", "charge", "amount"),
|
| 194 |
+
min_keyword_hits=2,
|
| 195 |
+
weight=0.6,
|
| 196 |
+
),
|
| 197 |
+
RubricIssue(
|
| 198 |
+
issue_id="missing-coupon-test",
|
| 199 |
+
file_path="tests/test_invoice_service.py",
|
| 200 |
+
line=11,
|
| 201 |
+
category="testing",
|
| 202 |
+
severity="warning",
|
| 203 |
+
keywords=("missing", "test", "coupon", "discount"),
|
| 204 |
+
min_keyword_hits=2,
|
| 205 |
+
weight=0.4,
|
| 206 |
+
),
|
| 207 |
+
),
|
| 208 |
+
max_steps=5,
|
| 209 |
+
),
|
| 210 |
+
TaskSpec(
|
| 211 |
+
task_id="py-pr-review-hard",
|
| 212 |
+
difficulty="hard",
|
| 213 |
+
title="Async Job Runner Deduplication",
|
| 214 |
+
goal=(
|
| 215 |
+
"Review the async job-runner PR and find the subtle concurrency issues "
|
| 216 |
+
"without inventing extra problems."
|
| 217 |
+
),
|
| 218 |
+
repo_summary=(
|
| 219 |
+
"A shared webhook backfill service is deduplicating in-flight work with an "
|
| 220 |
+
"async task cache and writing the latest result for operators to inspect."
|
| 221 |
+
),
|
| 222 |
+
visible_diff="\n".join(
|
| 223 |
+
[
|
| 224 |
+
"diff --git a/app/jobs/runner.py b/app/jobs/runner.py",
|
| 225 |
+
"@@",
|
| 226 |
+
" async def run_job(job_id: str, payload: dict, worker) -> str:",
|
| 227 |
+
" if job_id in ACTIVE_RUNS:",
|
| 228 |
+
" return await ACTIVE_RUNS[job_id]",
|
| 229 |
+
"+ lock = asyncio.Lock()",
|
| 230 |
+
"+ async with lock:",
|
| 231 |
+
"+ task = asyncio.create_task(worker.run(payload))",
|
| 232 |
+
"+ ACTIVE_RUNS[job_id] = task",
|
| 233 |
+
" try:",
|
| 234 |
+
" result = await task",
|
| 235 |
+
" finally:",
|
| 236 |
+
" ACTIVE_RUNS.pop(job_id, None)",
|
| 237 |
+
"+ Path(\"latest-result.json\").write_text(result)",
|
| 238 |
+
" return result",
|
| 239 |
+
]
|
| 240 |
+
),
|
| 241 |
+
file_contents={
|
| 242 |
+
"app/jobs/runner.py": "\n".join(
|
| 243 |
+
[
|
| 244 |
+
"import asyncio",
|
| 245 |
+
"from pathlib import Path",
|
| 246 |
+
"",
|
| 247 |
+
"ACTIVE_RUNS: dict[str, asyncio.Task[str]] = {}",
|
| 248 |
+
"",
|
| 249 |
+
"async def run_job(job_id: str, payload: dict, worker) -> str:",
|
| 250 |
+
" if job_id in ACTIVE_RUNS:",
|
| 251 |
+
" return await ACTIVE_RUNS[job_id]",
|
| 252 |
+
"",
|
| 253 |
+
" lock = asyncio.Lock()",
|
| 254 |
+
" async with lock:",
|
| 255 |
+
" task = asyncio.create_task(worker.run(payload))",
|
| 256 |
+
" ACTIVE_RUNS[job_id] = task",
|
| 257 |
+
" try:",
|
| 258 |
+
" result = await task",
|
| 259 |
+
" finally:",
|
| 260 |
+
" ACTIVE_RUNS.pop(job_id, None)",
|
| 261 |
+
"",
|
| 262 |
+
' Path("latest-result.json").write_text(result)',
|
| 263 |
+
" return result",
|
| 264 |
+
]
|
| 265 |
+
),
|
| 266 |
+
"tests/test_runner.py": "\n".join(
|
| 267 |
+
[
|
| 268 |
+
"import pytest",
|
| 269 |
+
"",
|
| 270 |
+
"from app.jobs.runner import run_job",
|
| 271 |
+
"",
|
| 272 |
+
"class FakeWorker:",
|
| 273 |
+
" async def run(self, payload: dict) -> str:",
|
| 274 |
+
' return payload["job_id"]',
|
| 275 |
+
"",
|
| 276 |
+
"@pytest.mark.asyncio",
|
| 277 |
+
"async def test_run_job_returns_worker_result():",
|
| 278 |
+
" worker = FakeWorker()",
|
| 279 |
+
' result = await run_job("job-1", {"job_id": "job-1"}, worker)',
|
| 280 |
+
' assert result == "job-1"',
|
| 281 |
+
]
|
| 282 |
+
),
|
| 283 |
+
},
|
| 284 |
+
changed_files=("app/jobs/runner.py", "tests/test_runner.py"),
|
| 285 |
+
rubric_issues=(
|
| 286 |
+
RubricIssue(
|
| 287 |
+
issue_id="per-call-lock-race",
|
| 288 |
+
file_path="app/jobs/runner.py",
|
| 289 |
+
line=9,
|
| 290 |
+
category="bug",
|
| 291 |
+
severity="warning",
|
| 292 |
+
keywords=("lock", "race", "concurrent", "duplicate"),
|
| 293 |
+
min_keyword_hits=2,
|
| 294 |
+
weight=0.55,
|
| 295 |
+
),
|
| 296 |
+
RubricIssue(
|
| 297 |
+
issue_id="shared-output-file-race",
|
| 298 |
+
file_path="app/jobs/runner.py",
|
| 299 |
+
line=18,
|
| 300 |
+
category="maintainability",
|
| 301 |
+
severity="warning",
|
| 302 |
+
keywords=("latest", "result", "file", "concurrent", "overwrite"),
|
| 303 |
+
min_keyword_hits=2,
|
| 304 |
+
weight=0.45,
|
| 305 |
+
),
|
| 306 |
+
),
|
| 307 |
+
max_steps=6,
|
| 308 |
+
),
|
| 309 |
+
]
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
TASKS_BY_ID: Dict[str, TaskSpec] = {task.task_id: task for task in TASKS}
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
def list_task_descriptors() -> List[TaskDescriptor]:
    """Return public descriptors for all tasks, in benchmark order."""
    return [spec.to_descriptor() for spec in TASKS]
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
def list_task_summaries() -> List[TaskSummary]:
    """Return task summaries for lightweight route responses."""
    return [spec.to_summary() for spec in TASKS]
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def get_task(task_id: str) -> TaskSpec:
    """Look up a task by id, raising ValueError for unknown ids."""
    try:
        return TASKS_BY_ID[task_id]
    except KeyError as missing:  # pragma: no cover
        # Re-raise as ValueError but keep the KeyError as the chained cause.
        raise ValueError(f"Unknown task_id: {task_id}") from missing
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
def task_ids() -> Iterable[str]:
    """Return task ids in benchmark order."""
    return [spec.task_id for spec in TASKS]
|
| 340 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/tasks/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Self-contained task definitions for container builds."""
|
| 2 |
+
|
| 3 |
+
from .task_bank import TaskSpec, get_task, list_task_descriptors, list_task_summaries, task_ids
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
"TaskSpec",
|
| 7 |
+
"get_task",
|
| 8 |
+
"list_task_descriptors",
|
| 9 |
+
"list_task_summaries",
|
| 10 |
+
"task_ids",
|
| 11 |
+
]
|
| 12 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/tasks/task_bank.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic task bank for self-contained server builds."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Dict, List, Optional
|
| 7 |
+
|
| 8 |
+
from ..models import Difficulty, TaskDescriptor, TaskKind
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass(frozen=True)
class TaskSpec:
    """A single coding task: prompt, starter/reference code, and grading knobs."""

    task_id: str
    title: str
    difficulty: Difficulty
    task_kind: TaskKind
    task_description: str
    # Code the agent starts from.
    starter_code: str
    # Known-good solution; never exposed via to_descriptor().
    reference_code: str
    # Assertion expressions shown to the agent.
    visible_tests: List[str]
    # Assertion expressions held back for grading; never exposed via to_descriptor().
    hidden_tests: List[str]
    max_steps: int = 10
    # Optional benchmark configuration (used by optimization-style tasks).
    benchmark_entrypoint: Optional[str] = None
    benchmark_builder: Optional[str] = None
    benchmark_repeats: int = 1
    benchmark_timeout_s: float = 2.0
    # Style-grading knobs.
    style_max_line_length: int = 88
    expected_quality_markers: List[str] = field(default_factory=list)

    def to_descriptor(self) -> TaskDescriptor:
        """Build the agent-facing view: omits reference code and hidden tests."""
        return TaskDescriptor(
            task_id=self.task_id,
            title=self.title,
            difficulty=self.difficulty,
            task_kind=self.task_kind,
            task_description=self.task_description,
            starter_code=self.starter_code,
            visible_tests=list(self.visible_tests),
            max_steps=self.max_steps,
        )
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
TASK_SYNTAX_FIX = TaskSpec(
|
| 44 |
+
task_id="syntax-fix-easy",
|
| 45 |
+
title="Fix a syntax-broken username normalizer",
|
| 46 |
+
difficulty="easy",
|
| 47 |
+
task_kind="syntax_fix",
|
| 48 |
+
task_description=(
|
| 49 |
+
"You are reviewing a utility function before merge. The submitted patch left "
|
| 50 |
+
"the function with syntax errors. Repair the code so it compiles and preserves "
|
| 51 |
+
"the intended behavior of trimming, lowercasing, and replacing spaces with underscores."
|
| 52 |
+
),
|
| 53 |
+
starter_code='''def normalize_username(raw_name: str) -> str:
|
| 54 |
+
cleaned = raw_name.strip().lower(
|
| 55 |
+
if not cleaned:
|
| 56 |
+
return "anonymous"
|
| 57 |
+
return cleaned.replace(" ", "_")
|
| 58 |
+
''',
|
| 59 |
+
reference_code='''def normalize_username(raw_name: str) -> str:
|
| 60 |
+
cleaned = raw_name.strip().lower()
|
| 61 |
+
if not cleaned:
|
| 62 |
+
return "anonymous"
|
| 63 |
+
return cleaned.replace(" ", "_")
|
| 64 |
+
''',
|
| 65 |
+
visible_tests=[
|
| 66 |
+
"normalize_username(' Alice Smith ') == 'alice_smith'",
|
| 67 |
+
"normalize_username(' ') == 'anonymous'",
|
| 68 |
+
"normalize_username('Bob') == 'bob'",
|
| 69 |
+
],
|
| 70 |
+
hidden_tests=[
|
| 71 |
+
"normalize_username(' HELLO WORLD ') == 'hello_world'",
|
| 72 |
+
"normalize_username('') == 'anonymous'",
|
| 73 |
+
],
|
| 74 |
+
max_steps=8,
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
TASK_BUG_FIX = TaskSpec(
|
| 79 |
+
task_id="bug-fix-medium",
|
| 80 |
+
title="Repair invoice discount calculation logic",
|
| 81 |
+
difficulty="medium",
|
| 82 |
+
task_kind="bug_fix",
|
| 83 |
+
task_description=(
|
| 84 |
+
"A billing helper function is returning the wrong amount after applying discounts. "
|
| 85 |
+
"The function signature is correct, but the calculation logic is broken. "
|
| 86 |
+
"Inspect the implementation, run visible tests, and fix the bug so all tests pass. "
|
| 87 |
+
"Do not change the function signature or validation logic."
|
| 88 |
+
),
|
| 89 |
+
starter_code='''from typing import Iterable
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def calculate_invoice_total(line_items: Iterable[int], discount_percent: int) -> int:
|
| 93 |
+
"""Calculate invoice total with discount applied."""
|
| 94 |
+
if discount_percent < 0 or discount_percent > 100:
|
| 95 |
+
raise ValueError("discount_percent must be between 0 and 100")
|
| 96 |
+
|
| 97 |
+
subtotal = sum(line_items)
|
| 98 |
+
discounted_total = subtotal - (subtotal * discount_percent // 100)
|
| 99 |
+
return subtotal
|
| 100 |
+
''',
|
| 101 |
+
reference_code='''from typing import Iterable
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def calculate_invoice_total(line_items: Iterable[int], discount_percent: int) -> int:
|
| 105 |
+
"""Calculate invoice total with discount applied."""
|
| 106 |
+
if discount_percent < 0 or discount_percent > 100:
|
| 107 |
+
raise ValueError("discount_percent must be between 0 and 100")
|
| 108 |
+
|
| 109 |
+
subtotal = sum(line_items)
|
| 110 |
+
discounted_total = subtotal - (subtotal * discount_percent // 100)
|
| 111 |
+
return discounted_total
|
| 112 |
+
''',
|
| 113 |
+
visible_tests=[
|
| 114 |
+
"calculate_invoice_total([1000, 2000], 0) == 3000",
|
| 115 |
+
"calculate_invoice_total([1000, 2000], 50) == 1500",
|
| 116 |
+
"calculate_invoice_total([1000], 10) == 900",
|
| 117 |
+
"calculate_invoice_total([], 0) == 0",
|
| 118 |
+
],
|
| 119 |
+
hidden_tests=[
|
| 120 |
+
"calculate_invoice_total([100, 200, 300], 25) == 450",
|
| 121 |
+
"calculate_invoice_total([5000], 99) == 50",
|
| 122 |
+
],
|
| 123 |
+
max_steps=10,
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
TASK_OPTIMIZATION = TaskSpec(
|
| 128 |
+
task_id="optimization-hard",
|
| 129 |
+
title="Optimize inefficient user activity summarization",
|
| 130 |
+
difficulty="hard",
|
| 131 |
+
task_kind="optimization",
|
| 132 |
+
task_description=(
|
| 133 |
+
"Code review found that `summarize_user_activity` is inefficient for large event streams. "
|
| 134 |
+
"The current implementation repeatedly scans the full event list for every user, making it O(n**2). "
|
| 135 |
+
"Refactor it to aggregate counts in one pass while preserving the sorted output contract. "
|
| 136 |
+
"Style and code quality also matter: use idiomatic Python, proper types, and clear logic. "
|
| 137 |
+
"All tests must pass, and the optimized version should be measurably faster."
|
| 138 |
+
),
|
| 139 |
+
starter_code='''from typing import Iterable
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def summarize_user_activity(events: Iterable[dict]) -> list[tuple[str, int]]:
|
| 143 |
+
"""Aggregate user activity counts."""
|
| 144 |
+
|
| 145 |
+
ordered_users = []
|
| 146 |
+
for event in events:
|
| 147 |
+
user_id = event["user_id"]
|
| 148 |
+
if user_id not in ordered_users:
|
| 149 |
+
ordered_users.append(user_id)
|
| 150 |
+
|
| 151 |
+
summary = []
|
| 152 |
+
for user_id in ordered_users:
|
| 153 |
+
count = 0
|
| 154 |
+
for event in events:
|
| 155 |
+
if event["user_id"] == user_id:
|
| 156 |
+
count += 1
|
| 157 |
+
summary.append((user_id, count))
|
| 158 |
+
return sorted(summary, key=lambda item: (-item[1], item[0]))
|
| 159 |
+
''',
|
| 160 |
+
reference_code='''from collections import Counter
|
| 161 |
+
from typing import Iterable
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def summarize_user_activity(events: Iterable[dict]) -> list[tuple[str, int]]:
|
| 165 |
+
"""Aggregate user activity counts in one pass."""
|
| 166 |
+
|
| 167 |
+
counts = Counter(event["user_id"] for event in events)
|
| 168 |
+
return sorted(counts.items(), key=lambda item: (-item[1], item[0]))
|
| 169 |
+
''',
|
| 170 |
+
visible_tests=[
|
| 171 |
+
"summarize_user_activity([{'user_id': 'alice'}, {'user_id': 'bob'}, {'user_id': 'alice'}]) == [('alice', 2), ('bob', 1)]",
|
| 172 |
+
"summarize_user_activity([{'user_id': 'z'}, {'user_id': 'a'}]) == [('a', 1), ('z', 1)]",
|
| 173 |
+
"summarize_user_activity([]) == []",
|
| 174 |
+
"summarize_user_activity([{'user_id': 'solo'}]) == [('solo', 1)]",
|
| 175 |
+
],
|
| 176 |
+
hidden_tests=[
|
| 177 |
+
"summarize_user_activity([{'user_id': 'u2'}, {'user_id': 'u1'}, {'user_id': 'u2'}, {'user_id': 'u2'}, {'user_id': 'u1'}]) == [('u2', 3), ('u1', 2)]",
|
| 178 |
+
],
|
| 179 |
+
max_steps=10,
|
| 180 |
+
benchmark_entrypoint="summarize_user_activity",
|
| 181 |
+
benchmark_builder='''def build_benchmark_events():
|
| 182 |
+
return [{"user_id": f"user_{index % 400}"} for index in range(6000)]''',
|
| 183 |
+
benchmark_repeats=3,
|
| 184 |
+
benchmark_timeout_s=1.0,
|
| 185 |
+
style_max_line_length=88,
|
| 186 |
+
expected_quality_markers=["Counter", "sorted"],
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
TASKS: Dict[str, TaskSpec] = {
|
| 191 |
+
"syntax-fix-easy": TASK_SYNTAX_FIX,
|
| 192 |
+
"bug-fix-medium": TASK_BUG_FIX,
|
| 193 |
+
"optimization-hard": TASK_OPTIMIZATION,
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def task_ids() -> List[str]:
    """Return task ids in the canonical benchmark order.

    Derived from the TASKS registry rather than a duplicated literal list so
    the two can never drift apart (dict insertion order is the registry order).
    """
    return list(TASKS)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def get_task(task_id: str) -> TaskSpec:
    """Look up a task spec, failing loudly for unknown ids."""
    spec = TASKS.get(task_id)
    if spec is None:
        raise ValueError(f"Task {task_id} not found. Available: {list(TASKS.keys())}")
    return spec
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def list_task_descriptors() -> List[TaskDescriptor]:
    """Build the public descriptor for every task, in benchmark order."""
    return [get_task(task_id).to_descriptor() for task_id in task_ids()]
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def list_task_summaries() -> List[TaskDescriptor]:
    """Alias for list_task_descriptors(); this bank has no separate summary model."""
    return list_task_descriptors()
|
| 213 |
+
|
server/app.py
CHANGED
|
@@ -7,17 +7,27 @@ import os
|
|
| 7 |
from fastapi import APIRouter, HTTPException
|
| 8 |
from fastapi.responses import RedirectResponse
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
from models import (
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
)
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
try:
|
|
|
|
| 7 |
from fastapi import APIRouter, HTTPException
|
| 8 |
from fastapi.responses import RedirectResponse
|
| 9 |
|
| 10 |
+
try:
|
| 11 |
+
from compat import create_app
|
| 12 |
+
from models import (
|
| 13 |
+
HealthResponse,
|
| 14 |
+
PythonCodeReviewAction,
|
| 15 |
+
PythonCodeReviewObservation,
|
| 16 |
+
PythonCodeReviewState,
|
| 17 |
+
TaskDescriptor,
|
| 18 |
+
TaskGrade,
|
| 19 |
+
)
|
| 20 |
+
except Exception:
|
| 21 |
+
from .compat import create_app
|
| 22 |
+
from .models import (
|
| 23 |
+
HealthResponse,
|
| 24 |
+
PythonCodeReviewAction,
|
| 25 |
+
PythonCodeReviewObservation,
|
| 26 |
+
PythonCodeReviewState,
|
| 27 |
+
TaskDescriptor,
|
| 28 |
+
TaskGrade,
|
| 29 |
+
)
|
| 30 |
+
from server.env import PythonCodeReviewEnvironment
|
| 31 |
|
| 32 |
|
| 33 |
try:
|
server/compat.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility helpers for OpenEnv and FastMCP runtime drift."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import sys
|
| 6 |
+
import types
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def install_openenv_fastmcp_compat() -> None:
    """Patch FastMCP API differences so older OpenEnv builds keep importing."""
    try:
        import fastmcp  # type: ignore
    except Exception:
        # FastMCP is not installed at all; there is nothing to patch.
        return

    def _ensure_module(name: str) -> types.ModuleType:
        # Return the module registered under ``name``, creating an empty
        # stand-in in sys.modules when it is missing.
        module = sys.modules.get(name)
        if module is None:
            module = types.ModuleType(name)
            sys.modules[name] = module
        return module

    # Best-effort: supply a minimal ``fastmcp.Client`` when the installed
    # FastMCP no longer exports one.
    try:
        if not hasattr(fastmcp, "Client"):

            class CompatClient:
                """Minimal async MCP client used for legacy OpenEnv imports."""

                def __init__(self, *args: Any, **kwargs: Any) -> None:
                    self.args = args
                    self.kwargs = kwargs

                async def __aenter__(self) -> "CompatClient":
                    return self

                async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool:
                    # Never suppress exceptions from the ``async with`` body.
                    return False

                async def list_tools(self) -> list[Any]:
                    return []

                async def call_tool(self, tool_name: str, arguments: dict[str, Any]) -> Any:
                    raise RuntimeError(
                        f"MCP client compatibility mode cannot call tool: {tool_name}"
                    )

            fastmcp.Client = CompatClient  # type: ignore[attr-defined]
    except Exception:
        # Compatibility patching is opportunistic; never break importers.
        pass

    # Best-effort: make ``fastmcp.client.client.CallToolResult`` importable.
    try:
        client_pkg = _ensure_module("fastmcp.client")
        client_mod = _ensure_module("fastmcp.client.client")

        if not hasattr(client_mod, "CallToolResult"):

            class CallToolResult:
                """Compatibility container for legacy OpenEnv response handling."""

                def __init__(
                    self,
                    content: Any = None,
                    structured_content: Any = None,
                    meta: Any = None,
                    data: Any = None,
                    is_error: bool = False,
                ) -> None:
                    self.content = content
                    self.structured_content = structured_content
                    self.meta = meta
                    self.data = data
                    self.is_error = is_error

            client_mod.CallToolResult = CallToolResult

        client_pkg.client = client_mod  # type: ignore[attr-defined]
    except Exception:
        pass
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
install_openenv_fastmcp_compat()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
from openenv.core.env_server.http_server import create_app as openenv_create_app
|
| 84 |
+
from openenv.core.env_server.interfaces import Environment
|
| 85 |
+
from openenv.core.env_server.types import Action, Observation, State
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
create_app = openenv_create_app
|
| 89 |
+
|
server/env_safe.py
CHANGED
|
@@ -5,18 +5,32 @@ from __future__ import annotations
|
|
| 5 |
from typing import Any, Optional
|
| 6 |
from uuid import uuid4
|
| 7 |
|
| 8 |
-
|
| 9 |
-
from
|
| 10 |
-
from
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
INVALID_ACTION_PENALTY = 0.10
|
|
@@ -489,4 +503,3 @@ class PythonCodeReviewEnvironment(
|
|
| 489 |
|
| 490 |
PythonEnvironment = PythonCodeReviewEnvironment
|
| 491 |
CodeReviewEnvironment = PythonCodeReviewEnvironment
|
| 492 |
-
|
|
|
|
| 5 |
from typing import Any, Optional
|
| 6 |
from uuid import uuid4
|
| 7 |
|
| 8 |
+
try:
|
| 9 |
+
from compat import Environment
|
| 10 |
+
from graders import grade_task
|
| 11 |
+
from models import (
|
| 12 |
+
HealthResponse,
|
| 13 |
+
HistoryEntry,
|
| 14 |
+
PythonCodeReviewAction,
|
| 15 |
+
PythonCodeReviewObservation,
|
| 16 |
+
PythonCodeReviewState,
|
| 17 |
+
RewardDetails,
|
| 18 |
+
TaskGrade,
|
| 19 |
+
)
|
| 20 |
+
from tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
|
| 21 |
+
except Exception:
|
| 22 |
+
from .compat import Environment
|
| 23 |
+
from .graders import grade_task
|
| 24 |
+
from .models import (
|
| 25 |
+
HealthResponse,
|
| 26 |
+
HistoryEntry,
|
| 27 |
+
PythonCodeReviewAction,
|
| 28 |
+
PythonCodeReviewObservation,
|
| 29 |
+
PythonCodeReviewState,
|
| 30 |
+
RewardDetails,
|
| 31 |
+
TaskGrade,
|
| 32 |
+
)
|
| 33 |
+
from .tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
|
| 34 |
|
| 35 |
|
| 36 |
INVALID_ACTION_PENALTY = 0.10
|
|
|
|
| 503 |
|
| 504 |
PythonEnvironment = PythonCodeReviewEnvironment
|
| 505 |
CodeReviewEnvironment = PythonCodeReviewEnvironment
|
|
|
server/graders/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic graders for self-contained server builds."""
|
| 2 |
+
|
| 3 |
+
from .common import clamp_score
|
| 4 |
+
from .optimization import grade_optimization_task
|
| 5 |
+
from .pytest_runner import PytestExecution, run_pytest_suite
|
| 6 |
+
from .syntax import grade_bug_fix_task, grade_syntax_task, grade_task
|
| 7 |
+
|
| 8 |
+
__all__ = [
|
| 9 |
+
"PytestExecution",
|
| 10 |
+
"clamp_score",
|
| 11 |
+
"grade_bug_fix_task",
|
| 12 |
+
"grade_optimization_task",
|
| 13 |
+
"grade_syntax_task",
|
| 14 |
+
"grade_task",
|
| 15 |
+
"run_pytest_suite",
|
| 16 |
+
]
|
| 17 |
+
|
server/graders/common.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared deterministic scoring helpers."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import ast
|
| 6 |
+
import difflib
|
| 7 |
+
import traceback
|
| 8 |
+
from typing import Tuple
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def clamp_score(value: float) -> float:
    """Snap a raw score into [0.0, 1.0] after rounding to six decimal places."""
    rounded = round(value, 6)
    capped = min(1.0, rounded)
    return max(0.0, capped)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def syntax_error_message(code: str) -> str:
    """Describe why *code* fails to parse, or return "" when it parses cleanly.

    SyntaxError produces a compact location-bearing message; any other parser
    failure falls back to a one-frame traceback summary.
    """
    try:
        ast.parse(code)
    except SyntaxError as exc:
        return f"{exc.msg} (line {exc.lineno}, column {exc.offset})"
    except Exception:
        return traceback.format_exc(limit=1).strip()
    else:
        return ""
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def compiles(code: str) -> bool:
    """Report whether *code* byte-compiles as a standalone module."""
    try:
        compile(code, "<candidate>", "exec")
        return True
    except Exception:
        # Any compile-time failure (syntax, null bytes, ...) counts as False.
        return False
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def normalized_diff_score(code: str, reference_code: str) -> float:
    """Similarity ratio between two code strings, ignoring all whitespace."""
    stripped_candidate = "".join(code.split())
    stripped_reference = "".join(reference_code.split())
    matcher = difflib.SequenceMatcher(a=stripped_candidate, b=stripped_reference)
    return clamp_score(matcher.ratio())
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def style_score(code: str, max_line_length: int = 88) -> float:
    """Score basic formatting hygiene, weighted 60/20/20 and clamped to [0, 1].

    Components: fraction of lines within ``max_line_length``; whether the code
    is tab-free; whether it is free of trailing whitespace.
    """
    lines = code.splitlines() or [""]
    within_limit = sum(1 for line in lines if len(line) <= max_line_length)
    length_component = within_limit / len(lines)
    tab_component = 0.0 if any("\t" in line for line in lines) else 1.0
    trailing_component = 0.0 if any(line != line.rstrip() for line in lines) else 1.0
    return clamp_score(
        (length_component * 0.6) + (tab_component * 0.2) + (trailing_component * 0.2)
    )
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def nested_loop_depth(tree: ast.AST) -> int:
    """Return the deepest nesting level of for/async-for/while loops in *tree*."""
    deepest = 0
    # Explicit DFS stack of (node, loop depth at that node) pairs.
    pending = [(tree, 0)]
    while pending:
        node, depth = pending.pop()
        if isinstance(node, (ast.For, ast.AsyncFor, ast.While)):
            depth += 1
            deepest = max(deepest, depth)
        for child in ast.iter_child_nodes(node):
            pending.append((child, depth))
    return deepest
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def compile_tree(code: str) -> Tuple[ast.AST | None, str]:
    """Parse *code*: returns (tree, '') on success, (None, error message) on SyntaxError."""
    try:
        tree = ast.parse(code)
    except SyntaxError as error:
        return None, f"{error.msg} (line {error.lineno}, column {error.offset})"
    return tree, ""
|
| 69 |
+
|
server/graders/optimization.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic grading for optimization tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
import tempfile
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
from .common import clamp_score, compile_tree, nested_loop_depth, style_score
|
| 12 |
+
from .pytest_runner import run_pytest_suite
|
| 13 |
+
from ..models import TaskGrade
|
| 14 |
+
from ..tasks.task_bank import TaskSpec
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _benchmark_script(task: TaskSpec) -> str:
    """Render a standalone benchmark script for *task*.

    The generated script imports the entrypoint from the ``candidate`` module,
    builds input data via the task-provided ``build_benchmark_events`` builder,
    times ``benchmark_repeats`` calls, and writes ``{"elapsed", "rows"}`` to
    ``benchmark.json`` in the working directory.
    """
    # ``Path`` is bound via __import__ inside the generated script so the
    # template needs no additional import lines at its top.
    return f"""import json
import time
from candidate import {task.benchmark_entrypoint}

{task.benchmark_builder}

events = build_benchmark_events()
start = time.perf_counter()
for _ in range({task.benchmark_repeats}):
    result = {task.benchmark_entrypoint}(events)
elapsed = time.perf_counter() - start
Path = __import__("pathlib").Path
Path("benchmark.json").write_text(json.dumps({{"elapsed": elapsed, "rows": len(result)}}), encoding="utf-8")
"""
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def benchmark_runtime(candidate_code: str, task: TaskSpec) -> tuple[float, bool, str]:
    """Benchmark *candidate_code* against the task's starter implementation.

    Writes both implementations plus generated runner scripts into a temp
    directory, executes each runner in a subprocess, and compares elapsed
    times. Returns ``(runtime_score, timed_out, output)`` where
    ``runtime_score`` in [0, 1] rewards speedups above 1x (saturating at 4x),
    ``timed_out`` flags a benchmark timeout, and ``output`` carries captured
    stdout/stderr plus a timing summary. Never raises.
    """
    assert task.benchmark_entrypoint is not None
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-bench-") as temp_dir:
            temp_path = Path(temp_dir)
            results_path = temp_path / "benchmark.json"
            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (temp_path / "starter.py").write_text(task.starter_code, encoding="utf-8")
            (temp_path / "candidate_runner.py").write_text(_benchmark_script(task), encoding="utf-8")
            starter_script = _benchmark_script(task).replace("from candidate import", "from starter import")
            (temp_path / "starter_runner.py").write_text(starter_script, encoding="utf-8")

            def _run(script_name: str):
                # Both runners write their results to benchmark.json in cwd.
                return subprocess.run(
                    [sys.executable, script_name],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=task.benchmark_timeout_s,
                    check=False,
                )

            try:
                starter_run = _run("starter_runner.py")
                starter_payload = json.loads(results_path.read_text(encoding="utf-8"))
                # BUG FIX: remove the starter's results before the candidate runs.
                # Previously a candidate that crashed without writing results was
                # silently graded against the stale starter payload (speedup 1.0).
                results_path.unlink()
                candidate_run = _run("candidate_runner.py")
                if not results_path.exists():
                    failure_output = ((candidate_run.stdout or "") + (candidate_run.stderr or "")).strip()
                    return 0.0, False, failure_output or "candidate benchmark produced no results"
                candidate_payload = json.loads(results_path.read_text(encoding="utf-8"))
            except subprocess.TimeoutExpired as exc:
                output = (exc.stdout or "") + (exc.stderr or "")
                return 0.0, True, (output or "benchmark timed out").strip()
            except Exception as exc:
                return 0.0, False, str(exc)

            # Guard against zero/negative clock readings before dividing.
            starter_elapsed = max(float(starter_payload["elapsed"]), 1e-9)
            candidate_elapsed = max(float(candidate_payload["elapsed"]), 1e-9)
            speedup = starter_elapsed / candidate_elapsed
            # Linear credit: 1x speedup -> 0.0, 4x or better -> 1.0.
            runtime_score = clamp_score(min((speedup - 1.0) / 3.0, 1.0))
            output = "\n".join(
                part
                for part in [
                    starter_run.stdout.strip(),
                    starter_run.stderr.strip(),
                    candidate_run.stdout.strip(),
                    candidate_run.stderr.strip(),
                    f"starter={starter_elapsed:.6f}s candidate={candidate_elapsed:.6f}s speedup={speedup:.2f}x",
                ]
                if part
            )
            return runtime_score, False, output
    except Exception as exc:
        # Last-resort guard: benchmarking must never crash the grading path.
        return 0.0, False, str(exc)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def ast_quality_score(code: str, task: TaskSpec) -> float:
    """Heuristic quality score in [0, 1]: +0.2 docstring, +0.4 shallow loops, +0.2 per quality marker."""
    import ast

    tree, _ = compile_tree(code)
    if tree is None:
        return 0.0

    points = 0.0
    # First top-level function (sync or async), if any, is checked for a docstring.
    for node in tree.body:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            if ast.get_docstring(node, clean=False):
                points += 0.2
            break
    if nested_loop_depth(tree) <= 1:
        points += 0.4
    matched_markers = sum(1 for marker in task.expected_quality_markers if marker in code)
    points += 0.2 * matched_markers
    return clamp_score(points)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def grade_optimization_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
|
| 107 |
+
execution = run_pytest_suite(candidate_code, [*task.visible_tests, *task.hidden_tests], timeout_s=task.benchmark_timeout_s)
|
| 108 |
+
test_fraction = execution.passed / execution.total if execution.total else 0.0
|
| 109 |
+
|
| 110 |
+
if execution.timed_out:
|
| 111 |
+
return TaskGrade(score=0.0, tests_passed=execution.passed, tests_total=execution.total, timed_out=True, details={"tests": execution.output})
|
| 112 |
+
|
| 113 |
+
runtime_score, timed_out, benchmark_output = benchmark_runtime(candidate_code, task)
|
| 114 |
+
if timed_out:
|
| 115 |
+
return TaskGrade(score=0.0, tests_passed=execution.passed, tests_total=execution.total, timed_out=True, details={"tests": execution.output, "benchmark": benchmark_output})
|
| 116 |
+
|
| 117 |
+
quality_score = ast_quality_score(candidate_code, task)
|
| 118 |
+
pep8_score = style_score(candidate_code, task.style_max_line_length)
|
| 119 |
+
score = clamp_score((0.5 * test_fraction) + (0.3 * runtime_score) + (0.15 * quality_score) + (0.05 * pep8_score))
|
| 120 |
+
return TaskGrade(
|
| 121 |
+
score=score,
|
| 122 |
+
syntax_score=1.0,
|
| 123 |
+
tests_passed=execution.passed,
|
| 124 |
+
tests_total=execution.total,
|
| 125 |
+
quality_score=quality_score,
|
| 126 |
+
runtime_score=runtime_score,
|
| 127 |
+
details={
|
| 128 |
+
"tests": execution.output,
|
| 129 |
+
"benchmark": benchmark_output,
|
| 130 |
+
"test_fraction": round(test_fraction, 4),
|
| 131 |
+
"runtime_score": round(runtime_score, 4),
|
| 132 |
+
"style_score": round(pep8_score, 4),
|
| 133 |
+
},
|
| 134 |
+
)
|
| 135 |
+
|
server/graders/pytest_runner.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Helpers for deterministic pytest execution in temp sandboxes."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
import tempfile
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Iterable
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass(frozen=True)
class PytestExecution:
    """Outcome of one sandboxed pytest run."""

    passed: int  # tests that passed in the "call" phase
    failed: int  # tests that failed in the "call" phase
    total: int  # total tests accounted for (at least the requested count)
    timed_out: bool  # True when the subprocess hit its timeout
    output: str  # combined stdout/stderr, or an error message
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _test_module_source(tests: Iterable[str]) -> str:
    """Build the test-module text for the sandbox.

    Snippets that already define a ``test_`` function are passed through
    verbatim; bare expressions are wrapped into numbered assert tests.
    Blank entries are skipped (their enumeration index is still consumed).
    """
    parts: list[str] = ["from candidate import *  # noqa: F401,F403"]
    for position, raw in enumerate(tests, start=1):
        text = str(raw).strip()
        if not text:
            continue
        if text.startswith("def test_"):
            parts.append(text)
        else:
            parts.append(f"def test_case_{position:03d}():\n    assert {text}")
    # The fallback is unreachable in practice (the import line is always present)
    # but preserved for safety.
    return "\n\n".join(parts) or "def test_placeholder():\n    assert True\n"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _runner_script() -> str:
    """Return the source of the in-sandbox pytest driver.

    The driver runs ``test_candidate.py`` with a plugin that counts pass/fail
    outcomes from the "call" phase only (setup/teardown reports are ignored)
    and writes the tallies plus pytest's exit code to ``pytest_results.json``.
    """
    return """import json
import pathlib
import pytest


class Collector:
    def __init__(self) -> None:
        self.passed = 0
        self.failed = 0

    def pytest_runtest_logreport(self, report):
        if report.when != "call":
            return
        if report.passed:
            self.passed += 1
        elif report.failed:
            self.failed += 1


collector = Collector()
exit_code = pytest.main(["-q", "test_candidate.py"], plugins=[collector])
payload = {
    "passed": collector.passed,
    "failed": collector.failed,
    "exit_code": int(exit_code),
}
pathlib.Path("pytest_results.json").write_text(json.dumps(payload), encoding="utf-8")
"""
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def run_pytest_suite(candidate_code: str, tests: Iterable[str], timeout_s: float = 3.0) -> PytestExecution:
    """Run *tests* against *candidate_code* under pytest in a throwaway temp dir.

    Never raises: timeouts, a missing or corrupt results file, and any
    unexpected error are all folded into the returned ``PytestExecution``.
    """
    test_cases = list(tests)
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-") as temp_dir:
            temp_path = Path(temp_dir)
            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (temp_path / "test_candidate.py").write_text(_test_module_source(test_cases), encoding="utf-8")
            (temp_path / "runner.py").write_text(_runner_script(), encoding="utf-8")

            try:
                completed = subprocess.run(
                    [sys.executable, "runner.py"],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=timeout_s,
                    check=False,
                )
            except subprocess.TimeoutExpired as exc:
                # Timeout counts as "all tests failed"; keep any partial output.
                output = (exc.stdout or "") + (exc.stderr or "")
                return PytestExecution(
                    passed=0,
                    failed=max(len(test_cases), 1),
                    total=max(len(test_cases), 1),
                    timed_out=True,
                    output=(output or "pytest timed out").strip(),
                )

            result_path = temp_path / "pytest_results.json"
            if not result_path.exists():
                # Runner crashed before writing results (e.g. import error in candidate).
                output = (completed.stdout or "") + (completed.stderr or "")
                total = max(len(test_cases), 1)
                return PytestExecution(0, total, total, False, output.strip())

            try:
                payload = json.loads(result_path.read_text(encoding="utf-8"))
            except Exception as exc:
                return PytestExecution(0, max(len(test_cases), 1), max(len(test_cases), 1), False, (output or str(exc)).strip())

            passed = int(payload.get("passed", 0))
            failed = int(payload.get("failed", 0))
            # Collection errors can make passed+failed undercount; never report
            # fewer total tests than were requested.
            total = max(passed + failed, len(test_cases))
            output = ((completed.stdout or "") + (completed.stderr or "")).strip()
            return PytestExecution(passed, failed, total, False, output)
    except Exception as exc:
        # Last-resort guard so grading never crashes the server.
        return PytestExecution(0, max(len(test_cases), 1), max(len(test_cases), 1), False, str(exc))
|
| 121 |
+
|
server/graders/syntax.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task graders for syntax and bug-fix tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from .common import clamp_score, compiles, normalized_diff_score, style_score, syntax_error_message
|
| 6 |
+
from .optimization import grade_optimization_task
|
| 7 |
+
from .pytest_runner import run_pytest_suite
|
| 8 |
+
from ..models import TaskGrade
|
| 9 |
+
from ..tasks.task_bank import TaskSpec
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def grade_syntax_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Grade a syntax-fix task: full credit once the code parses, partial credit by reference similarity otherwise."""
    compile_error = syntax_error_message(candidate_code)
    diff_score = normalized_diff_score(candidate_code, task.reference_code)
    style_base = style_score(candidate_code, task.style_max_line_length)

    if compile_error:
        # Still broken: 0.15 floor plus up to 0.55 for closeness to the reference.
        partial = clamp_score(0.15 + (0.55 * diff_score))
        return TaskGrade(
            score=partial,
            syntax_score=0.0,
            quality_score=diff_score * style_base,
            details={"compile_error": compile_error},
        )
    return TaskGrade(score=1.0, syntax_score=1.0, quality_score=style_base, details={"compile_error": ""})
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def grade_bug_fix_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
    """Grade a bug-fix task by its pytest pass fraction; non-compiling code scores zero."""
    if not compiles(candidate_code):
        return TaskGrade(
            score=0.0,
            syntax_score=0.0,
            details={"compile_error": syntax_error_message(candidate_code)},
        )

    suite = list(task.visible_tests) + (list(task.hidden_tests) if include_hidden else [])
    execution = run_pytest_suite(candidate_code, suite, timeout_s=3.0)

    if execution.timed_out:
        return TaskGrade(
            score=0.0,
            syntax_score=1.0,
            tests_passed=execution.passed,
            tests_total=execution.total,
            timed_out=True,
            details={"compile_error": "", "tests": execution.output},
        )

    fraction = execution.passed / execution.total if execution.total else 0.0
    return TaskGrade(
        score=clamp_score(fraction),
        syntax_score=1.0,
        tests_passed=execution.passed,
        tests_total=execution.total,
        quality_score=style_score(candidate_code, task.style_max_line_length),
        details={"compile_error": "", "tests": execution.output},
    )
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def grade_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
    """Dispatch grading by task kind; optimization grading is the fallback."""
    kind = task.task_kind
    if kind == "syntax_fix":
        return grade_syntax_task(candidate_code, task)
    if kind == "bug_fix":
        return grade_bug_fix_task(candidate_code, task, include_hidden=include_hidden)
    return grade_optimization_task(candidate_code, task)
|
| 60 |
+
|
server/models.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typed models for the self-contained server package."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Dict, List, Literal, Optional
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
from .compat import Action, Observation, State
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
Difficulty = Literal["easy", "medium", "hard"]
|
| 13 |
+
TaskKind = Literal["syntax_fix", "bug_fix", "optimization"]
|
| 14 |
+
ActionType = Literal["analyze_code", "edit_code", "run_tests", "submit_solution"]
|
| 15 |
+
Category = Literal["bug", "security", "performance", "maintainability", "style", "testing"]
|
| 16 |
+
Severity = Literal["critical", "warning", "info"]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class HistoryEntry(BaseModel):
    """A single entry in the episode's per-step action history."""

    step: int = Field(..., ge=0)  # zero-based step index
    action_type: ActionType
    status: str  # status string reported for the action
    reward: float  # reward emitted for this step
|
| 25 |
+
|
| 26 |
+
class RewardDetails(BaseModel):
    """Itemized breakdown of one step's reward signal."""

    value: float  # final combined reward value for the step
    syntax_reward: float = 0.0
    test_reward: float = 0.0
    quality_bonus: float = 0.0
    correctness_bonus: float = 0.0
    progress_delta: float = 0.0
    stagnation_penalty: float = 0.0
    regression_penalty: float = 0.0
    invalid_action_penalty: float = 0.0
    timeout_penalty: float = 0.0
    reason: str  # human-readable explanation of the reward
    prev_score: float = 0.0
    curr_score: float = 0.0
    code_changed: bool = False  # whether the action modified the code
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class PythonCodeReviewAction(Action):
    """Action submitted by the agent on each step."""

    action_type: ActionType
    # Source-code payload; presumably consumed by edit_code/submit_solution — confirm in the env.
    code: Optional[str] = None
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class PythonCodeReviewObservation(Observation):
    """Observation returned to the agent after reset/step."""

    task_id: str
    title: str = ""
    difficulty: Difficulty
    task_kind: Optional[TaskKind] = None
    task_description: str
    current_code: str
    errors: str
    test_results: str
    visible_tests: List[str] = Field(default_factory=list)
    history: List[HistoryEntry] = Field(default_factory=list)
    attempts_remaining: int = Field(..., ge=0)
    last_action_status: str = ""
    score: float = Field(..., ge=0.0, le=1.0)  # normalized task score in [0, 1]
    # Defaults to a neutral "Reset" breakdown for the first observation.
    reward_details: RewardDetails = Field(
        default_factory=lambda: RewardDetails(value=0.0, reason="Reset")
    )
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class PythonCodeReviewState(State):
    """Server-side episode state tracked between steps."""

    episode_id: str
    step_count: int = Field(default=0, ge=0)
    task_id: Optional[str] = None
    difficulty: Optional[Difficulty] = None
    task_kind: Optional[TaskKind] = None
    attempts_remaining: int = Field(default=0, ge=0)
    current_code: str = ""
    errors: str = ""
    test_results: str = ""
    history: List[HistoryEntry] = Field(default_factory=list)
    score: float = Field(default=0.0, ge=0.0, le=1.0)
    done: bool = False  # True once the episode has terminated
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class TaskDescriptor(BaseModel):
    """Agent-facing description of a task.

    Deliberately excludes the reference solution and hidden tests (see
    ``TaskSpec.to_descriptor``).
    """

    task_id: str
    title: str
    difficulty: Difficulty
    task_kind: Optional[TaskKind] = None
    task_description: str = ""
    starter_code: str = ""
    visible_tests: List[str] = Field(default_factory=list)
    goal: str = ""
    repo_summary: str = ""
    changed_files: List[str] = Field(default_factory=list)
    available_files: List[str] = Field(default_factory=list)
    max_steps: int = Field(..., ge=1)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class TaskSummary(BaseModel):
    """Minimal listing entry for a task."""

    task_id: str
    difficulty: Difficulty
    title: str
    goal: str = ""
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class ReviewFinding(BaseModel):
    """One issue reported in a code review."""

    title: str
    file_path: str = ""
    line: Optional[int] = Field(default=None, ge=1)  # 1-based line number when known
    category: Category = "bug"
    severity: Severity = "warning"
    rationale: str = ""
    recommendation: str = ""
    rule_id: str = ""

    @property
    def explanation(self) -> str:
        """Alias for ``rationale`` — presumably kept for older callers; confirm before removing."""
        return self.rationale

    @property
    def suggested_fix(self) -> str:
        """Alias for ``recommendation`` — presumably kept for older callers; confirm before removing."""
        return self.recommendation
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
class DirectReviewResponse(BaseModel):
    """Structured result of a one-shot (non-episodic) code review."""

    issues: List[ReviewFinding] = Field(default_factory=list)
    summary: str = ""
    score: float = Field(default=0.0, ge=0.0, le=1.0)
    improved_code: Optional[str] = None  # optional rewritten source, when provided
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class TaskGrade(BaseModel):
    """Deterministic grading result for a candidate submission."""

    score: float = Field(..., ge=0.0, le=1.0)  # overall blended score
    syntax_score: float = Field(default=0.0, ge=0.0, le=1.0)
    tests_passed: int = Field(default=0, ge=0)
    tests_total: int = Field(default=0, ge=0)
    quality_score: float = Field(default=0.0, ge=0.0, le=1.0)
    runtime_score: float = Field(default=0.0, ge=0.0, le=1.0)
    timed_out: bool = False  # True when test or benchmark execution timed out
    matched_issue_ids: List[str] = Field(default_factory=list)
    false_positives: int = Field(default=0, ge=0)
    duplicate_findings: int = Field(default=0, ge=0)
    matched_weight: float = Field(default=0.0, ge=0.0, le=1.0)
    details: Dict[str, Any] = Field(default_factory=dict)  # free-form grader diagnostics
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
class HealthResponse(BaseModel):
    """Payload returned by the health endpoint."""

    status: Literal["ok"] = "ok"
    environment: str = "python_code_review_env"
    task_count: int = Field(default=0, ge=0)  # number of tasks in the bank
|
| 149 |
+
|
server/tasks/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Self-contained task definitions for container builds."""
|
| 2 |
+
|
| 3 |
+
from .task_bank import TaskSpec, get_task, list_task_descriptors, list_task_summaries, task_ids
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
"TaskSpec",
|
| 7 |
+
"get_task",
|
| 8 |
+
"list_task_descriptors",
|
| 9 |
+
"list_task_summaries",
|
| 10 |
+
"task_ids",
|
| 11 |
+
]
|
| 12 |
+
|
server/tasks/task_bank.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic task bank for self-contained server builds."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Dict, List, Optional
|
| 7 |
+
|
| 8 |
+
from ..models import Difficulty, TaskDescriptor, TaskKind
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass(frozen=True)
class TaskSpec:
    """Immutable definition of one gradable task, including tests and benchmark config."""

    task_id: str
    title: str
    difficulty: Difficulty
    task_kind: TaskKind
    task_description: str
    starter_code: str  # code presented to the agent at episode start
    reference_code: str  # known-good solution; used for similarity scoring, never exposed
    visible_tests: List[str]  # shown to the agent via the descriptor
    hidden_tests: List[str]  # used only during grading
    max_steps: int = 10
    benchmark_entrypoint: Optional[str] = None  # function name timed for optimization tasks
    benchmark_builder: Optional[str] = None  # source of build_benchmark_events() for the benchmark script
    benchmark_repeats: int = 1
    benchmark_timeout_s: float = 2.0
    style_max_line_length: int = 88
    expected_quality_markers: List[str] = field(default_factory=list)  # substrings rewarded by quality scoring

    def to_descriptor(self) -> TaskDescriptor:
        """Build the agent-facing view: omits reference code and hidden tests."""
        return TaskDescriptor(
            task_id=self.task_id,
            title=self.title,
            difficulty=self.difficulty,
            task_kind=self.task_kind,
            task_description=self.task_description,
            starter_code=self.starter_code,
            visible_tests=list(self.visible_tests),
            max_steps=self.max_steps,
        )
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
TASK_SYNTAX_FIX = TaskSpec(
|
| 44 |
+
task_id="syntax-fix-easy",
|
| 45 |
+
title="Fix a syntax-broken username normalizer",
|
| 46 |
+
difficulty="easy",
|
| 47 |
+
task_kind="syntax_fix",
|
| 48 |
+
task_description=(
|
| 49 |
+
"You are reviewing a utility function before merge. The submitted patch left "
|
| 50 |
+
"the function with syntax errors. Repair the code so it compiles and preserves "
|
| 51 |
+
"the intended behavior of trimming, lowercasing, and replacing spaces with underscores."
|
| 52 |
+
),
|
| 53 |
+
starter_code='''def normalize_username(raw_name: str) -> str:
|
| 54 |
+
cleaned = raw_name.strip().lower(
|
| 55 |
+
if not cleaned:
|
| 56 |
+
return "anonymous"
|
| 57 |
+
return cleaned.replace(" ", "_")
|
| 58 |
+
''',
|
| 59 |
+
reference_code='''def normalize_username(raw_name: str) -> str:
|
| 60 |
+
cleaned = raw_name.strip().lower()
|
| 61 |
+
if not cleaned:
|
| 62 |
+
return "anonymous"
|
| 63 |
+
return cleaned.replace(" ", "_")
|
| 64 |
+
''',
|
| 65 |
+
visible_tests=[
|
| 66 |
+
"normalize_username(' Alice Smith ') == 'alice_smith'",
|
| 67 |
+
"normalize_username(' ') == 'anonymous'",
|
| 68 |
+
"normalize_username('Bob') == 'bob'",
|
| 69 |
+
],
|
| 70 |
+
hidden_tests=[
|
| 71 |
+
"normalize_username(' HELLO WORLD ') == 'hello_world'",
|
| 72 |
+
"normalize_username('') == 'anonymous'",
|
| 73 |
+
],
|
| 74 |
+
max_steps=8,
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
TASK_BUG_FIX = TaskSpec(
|
| 79 |
+
task_id="bug-fix-medium",
|
| 80 |
+
title="Repair invoice discount calculation logic",
|
| 81 |
+
difficulty="medium",
|
| 82 |
+
task_kind="bug_fix",
|
| 83 |
+
task_description=(
|
| 84 |
+
"A billing helper function is returning the wrong amount after applying discounts. "
|
| 85 |
+
"The function signature is correct, but the calculation logic is broken. "
|
| 86 |
+
"Inspect the implementation, run visible tests, and fix the bug so all tests pass. "
|
| 87 |
+
"Do not change the function signature or validation logic."
|
| 88 |
+
),
|
| 89 |
+
starter_code='''from typing import Iterable
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def calculate_invoice_total(line_items: Iterable[int], discount_percent: int) -> int:
|
| 93 |
+
"""Calculate invoice total with discount applied."""
|
| 94 |
+
if discount_percent < 0 or discount_percent > 100:
|
| 95 |
+
raise ValueError("discount_percent must be between 0 and 100")
|
| 96 |
+
|
| 97 |
+
subtotal = sum(line_items)
|
| 98 |
+
discounted_total = subtotal - (subtotal * discount_percent // 100)
|
| 99 |
+
return subtotal
|
| 100 |
+
''',
|
| 101 |
+
reference_code='''from typing import Iterable
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def calculate_invoice_total(line_items: Iterable[int], discount_percent: int) -> int:
|
| 105 |
+
"""Calculate invoice total with discount applied."""
|
| 106 |
+
if discount_percent < 0 or discount_percent > 100:
|
| 107 |
+
raise ValueError("discount_percent must be between 0 and 100")
|
| 108 |
+
|
| 109 |
+
subtotal = sum(line_items)
|
| 110 |
+
discounted_total = subtotal - (subtotal * discount_percent // 100)
|
| 111 |
+
return discounted_total
|
| 112 |
+
''',
|
| 113 |
+
visible_tests=[
|
| 114 |
+
"calculate_invoice_total([1000, 2000], 0) == 3000",
|
| 115 |
+
"calculate_invoice_total([1000, 2000], 50) == 1500",
|
| 116 |
+
"calculate_invoice_total([1000], 10) == 900",
|
| 117 |
+
"calculate_invoice_total([], 0) == 0",
|
| 118 |
+
],
|
| 119 |
+
hidden_tests=[
|
| 120 |
+
"calculate_invoice_total([100, 200, 300], 25) == 450",
|
| 121 |
+
"calculate_invoice_total([5000], 99) == 50",
|
| 122 |
+
],
|
| 123 |
+
max_steps=10,
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# Hard-difficulty optimization task: the agent must turn an O(n**2)
# per-user rescan into a single-pass aggregation while keeping the
# sorted (count desc, user_id asc) output contract. Graded on tests,
# a wall-clock benchmark against reference_code, and style markers.
TASK_OPTIMIZATION = TaskSpec(
    task_id="optimization-hard",
    title="Optimize inefficient user activity summarization",
    difficulty="hard",
    task_kind="optimization",
    task_description=(
        "Code review found that `summarize_user_activity` is inefficient for large event streams. "
        "The current implementation repeatedly scans the full event list for every user, making it O(n**2). "
        "Refactor it to aggregate counts in one pass while preserving the sorted output contract. "
        "Style and code quality also matter: use idiomatic Python, proper types, and clear logic. "
        "All tests must pass, and the optimized version should be measurably faster."
    ),
    # Deliberately quadratic implementation handed to the agent.
    starter_code='''from typing import Iterable


def summarize_user_activity(events: Iterable[dict]) -> list[tuple[str, int]]:
    """Aggregate user activity counts."""

    ordered_users = []
    for event in events:
        user_id = event["user_id"]
        if user_id not in ordered_users:
            ordered_users.append(user_id)

    summary = []
    for user_id in ordered_users:
        count = 0
        for event in events:
            if event["user_id"] == user_id:
                count += 1
        summary.append((user_id, count))
    return sorted(summary, key=lambda item: (-item[1], item[0]))
''',
    # One-pass Counter-based solution used as the grading baseline.
    reference_code='''from collections import Counter
from typing import Iterable


def summarize_user_activity(events: Iterable[dict]) -> list[tuple[str, int]]:
    """Aggregate user activity counts in one pass."""

    counts = Counter(event["user_id"] for event in events)
    return sorted(counts.items(), key=lambda item: (-item[1], item[0]))
''',
    # Each test is a bare boolean expression evaluated by the grader.
    visible_tests=[
        "summarize_user_activity([{'user_id': 'alice'}, {'user_id': 'bob'}, {'user_id': 'alice'}]) == [('alice', 2), ('bob', 1)]",
        "summarize_user_activity([{'user_id': 'z'}, {'user_id': 'a'}]) == [('a', 1), ('z', 1)]",
        "summarize_user_activity([]) == []",
        "summarize_user_activity([{'user_id': 'solo'}]) == [('solo', 1)]",
    ],
    hidden_tests=[
        "summarize_user_activity([{'user_id': 'u2'}, {'user_id': 'u1'}, {'user_id': 'u2'}, {'user_id': 'u2'}, {'user_id': 'u1'}]) == [('u2', 3), ('u1', 2)]",
    ],
    max_steps=10,
    # Benchmark: 6000 events over 400 users, repeated 3 times,
    # each run capped at 1 second.
    benchmark_entrypoint="summarize_user_activity",
    benchmark_builder='''def build_benchmark_events():
    return [{"user_id": f"user_{index % 400}"} for index in range(6000)]''',
    benchmark_repeats=3,
    benchmark_timeout_s=1.0,
    style_max_line_length=88,
    # Quality markers the grader looks for in the submitted code.
    expected_quality_markers=["Counter", "sorted"],
)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
# Registry of all built-in tasks, keyed by task_id. Insertion order
# defines the canonical listing order (easy -> medium -> hard).
TASKS: Dict[str, TaskSpec] = {
    "syntax-fix-easy": TASK_SYNTAX_FIX,
    "bug-fix-medium": TASK_BUG_FIX,
    "optimization-hard": TASK_OPTIMIZATION,
}
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def task_ids() -> List[str]:
    """Return the ids of all registered tasks, in registry order.

    Derived from the ``TASKS`` registry instead of repeating the ids as
    literals, so this list can never drift out of sync when a task is
    added or removed. Dict insertion order preserves the original
    easy -> medium -> hard ordering.
    """
    return list(TASKS)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def get_task(task_id: str) -> TaskSpec:
    """Return the TaskSpec registered under ``task_id``.

    Raises:
        ValueError: if the id is unknown; the message lists valid ids.
    """
    spec = TASKS.get(task_id)
    if spec is None:
        # Registry values are always TaskSpec instances, so None can
        # only mean the key is absent.
        raise ValueError(f"Task {task_id} not found. Available: {list(TASKS.keys())}")
    return spec
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def list_task_descriptors() -> List[TaskDescriptor]:
    """Build a descriptor for every registered task, in registry order."""
    descriptors = []
    for task_id in task_ids():
        descriptors.append(get_task(task_id).to_descriptor())
    return descriptors
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def list_task_summaries() -> List[TaskDescriptor]:
    """Alias of ``list_task_descriptors``, kept for API compatibility."""
    summaries = list_task_descriptors()
    return summaries
|
| 213 |
+
|