uvpatel7271 committed on
Commit
558b89d
·
verified ·
1 Parent(s): 615272a

Upload folder using huggingface_hub

Browse files
Files changed (34) hide show
  1. Dockerfile +3 -3
  2. inference.py +37 -7
  3. pytest-cache-files-1f62ra1g/container_sim/server/Dockerfile +32 -0
  4. pytest-cache-files-1f62ra1g/container_sim/server/__init__.py +5 -0
  5. pytest-cache-files-1f62ra1g/container_sim/server/app.py +127 -0
  6. pytest-cache-files-1f62ra1g/container_sim/server/code_review_env_environment.py +9 -0
  7. pytest-cache-files-1f62ra1g/container_sim/server/code_review_environment.py +5 -0
  8. pytest-cache-files-1f62ra1g/container_sim/server/compat.py +89 -0
  9. pytest-cache-files-1f62ra1g/container_sim/server/env.py +1 -0
  10. pytest-cache-files-1f62ra1g/container_sim/server/env_safe.py +505 -0
  11. pytest-cache-files-1f62ra1g/container_sim/server/graders/__init__.py +17 -0
  12. pytest-cache-files-1f62ra1g/container_sim/server/graders/common.py +69 -0
  13. pytest-cache-files-1f62ra1g/container_sim/server/graders/optimization.py +135 -0
  14. pytest-cache-files-1f62ra1g/container_sim/server/graders/pytest_runner.py +121 -0
  15. pytest-cache-files-1f62ra1g/container_sim/server/graders/syntax.py +60 -0
  16. pytest-cache-files-1f62ra1g/container_sim/server/grading.py +147 -0
  17. pytest-cache-files-1f62ra1g/container_sim/server/models.py +149 -0
  18. pytest-cache-files-1f62ra1g/container_sim/server/python_env_environment.py +9 -0
  19. pytest-cache-files-1f62ra1g/container_sim/server/requirements.txt +6 -0
  20. pytest-cache-files-1f62ra1g/container_sim/server/static_review.py +273 -0
  21. pytest-cache-files-1f62ra1g/container_sim/server/task_bank.py +340 -0
  22. pytest-cache-files-1f62ra1g/container_sim/server/tasks/__init__.py +12 -0
  23. pytest-cache-files-1f62ra1g/container_sim/server/tasks/task_bank.py +213 -0
  24. server/app.py +21 -11
  25. server/compat.py +89 -0
  26. server/env_safe.py +26 -13
  27. server/graders/__init__.py +17 -0
  28. server/graders/common.py +69 -0
  29. server/graders/optimization.py +135 -0
  30. server/graders/pytest_runner.py +121 -0
  31. server/graders/syntax.py +60 -0
  32. server/models.py +149 -0
  33. server/tasks/__init__.py +12 -0
  34. server/tasks/task_bank.py +213 -0
Dockerfile CHANGED
@@ -10,11 +10,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
  # Install Python dependencies
13
- COPY server/requirements.txt /app/server/requirements.txt
14
  RUN pip install --no-cache-dir -r /app/server/requirements.txt
15
 
16
- # Copy source code
17
- COPY . /app
18
 
19
  # Set environment variables
20
  ENV PYTHONUNBUFFERED=1
 
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
  # Install Python dependencies
13
+ COPY requirements.txt /app/server/requirements.txt
14
  RUN pip install --no-cache-dir -r /app/server/requirements.txt
15
 
16
+ # Copy the self-contained server package
17
+ COPY . /app/server
18
 
19
  # Set environment variables
20
  ENV PYTHONUNBUFFERED=1
inference.py CHANGED
@@ -404,7 +404,7 @@ def run_env(client: Optional[Any], model: str) -> Dict[str, Any]:
404
 
405
 
406
  def format_step_message(result: Dict[str, Any]) -> str:
407
- """Format the only allowed STEP line for stdout."""
408
  try:
409
  fallback = bool(result.get("fallback", False))
410
  reason = safe_text(result.get("reason", "completed"), "completed").lower().replace(" ", "_")
@@ -429,21 +429,49 @@ def format_step_message(result: Dict[str, Any]) -> str:
429
  return "error handled: formatting_failed"
430
 
431
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
  def main() -> int:
433
  """Run the inference workflow and always terminate successfully."""
 
434
  step_message = "error handled: initialization_failed"
 
 
435
  try:
436
  model_name = safe_env("MODEL_NAME", DEFAULT_MODEL_NAME) or DEFAULT_MODEL_NAME
437
  client = create_client()
438
  result = run_env(client, model_name)
439
  step_message = format_step_message(result)
 
440
  except BaseException as exc:
441
  step_message = f"error handled: {safe_text(exc, 'unexpected_failure').lower().replace(' ', '_')[:64]}"
 
442
  finally:
443
  try:
444
- print("START")
445
- print(f"STEP: {step_message}")
446
- print("END")
447
  except Exception:
448
  pass
449
  return 0
@@ -454,9 +482,11 @@ if __name__ == "__main__":
454
  main()
455
  except BaseException:
456
  try:
457
- print("START")
458
- print("STEP: error handled: fatal_guard")
459
- print("END")
 
 
460
  except Exception:
461
  pass
462
  sys.exit(0)
 
404
 
405
 
406
  def format_step_message(result: Dict[str, Any]) -> str:
407
+ """Format the structured STEP payload for stdout."""
408
  try:
409
  fallback = bool(result.get("fallback", False))
410
  reason = safe_text(result.get("reason", "completed"), "completed").lower().replace(" ", "_")
 
429
  return "error handled: formatting_failed"
430
 
431
 
432
def format_start_message() -> str:
    """Build the START payload string emitted before any STEP output."""
    payload = "task=python_code_review_env"
    return payload
435
+
436
+
437
def format_end_message(result: Optional[Dict[str, Any]]) -> str:
    """Build the structured END payload string for stdout.

    Falls back to a fixed, safe payload if any field fails to render.
    """
    fallback_line = "task=python_code_review_env status=ok score=0.0000 done=true fallback=true"
    try:
        data = result if result else {}
        status = safe_text(data.get("status", "ok"), "ok").lower().replace(" ", "_")
        score = safe_float(data.get("score", 0.0), 0.0)
        done_flag = str(bool(data.get("done", True))).lower()
        fb_flag = str(bool(data.get("fallback", True))).lower()
        return (
            f"task=python_code_review_env status={status} "
            f"score={score:.4f} done={done_flag} fallback={fb_flag}"
        )
    except Exception:
        return fallback_line
448
+
449
+
450
def emit_structured_output(start_message: str, step_message: str, end_message: str) -> None:
    """Write the evaluator-readable [START]/[STEP]/[END] lines to stdout."""
    blocks = (("START", start_message), ("STEP", step_message), ("END", end_message))
    for tag, message in blocks:
        print(f"[{tag}] {message}", flush=True)
455
+
456
+
457
def main() -> int:
    """Drive the inference workflow; never raise, always return 0.

    Structured START/STEP/END lines are emitted in the ``finally`` block so
    the evaluator sees output even when the workflow fails early.
    """
    start_line = format_start_message()
    step_line = "error handled: initialization_failed"
    end_line = "task=python_code_review_env status=ok score=0.0000 done=true fallback=true"
    result: Optional[Dict[str, Any]] = None
    try:
        model_name = safe_env("MODEL_NAME", DEFAULT_MODEL_NAME) or DEFAULT_MODEL_NAME
        client = create_client()
        result = run_env(client, model_name)
        step_line = format_step_message(result)
        end_line = format_end_message(result)
    except BaseException as exc:
        # Catch everything (including SystemExit/KeyboardInterrupt) so the
        # structured output below is always produced.
        reason = safe_text(exc, 'unexpected_failure').lower().replace(' ', '_')[:64]
        step_line = f"error handled: {reason}"
        end_line = format_end_message(result)
    finally:
        try:
            emit_structured_output(start_line, step_line, end_line)
        except Exception:
            pass
    return 0
 
482
  main()
483
  except BaseException:
484
  try:
485
+ emit_structured_output(
486
+ format_start_message(),
487
+ "error handled: fatal_guard",
488
+ "task=python_code_review_env status=ok score=0.0000 done=true fallback=true",
489
+ )
490
  except Exception:
491
  pass
492
  sys.exit(0)
pytest-cache-files-1f62ra1g/container_sim/server/Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

# System packages: gcc for builds, git for VCS installs, curl for the healthcheck.
RUN apt-get update && apt-get install -y --no-install-recommends \
        gcc \
        git \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies first so this layer caches across code changes.
COPY requirements.txt /app/server/requirements.txt
RUN pip install --no-cache-dir -r /app/server/requirements.txt

# Copy the self-contained server package.
COPY . /app/server

# Runtime configuration (single layer).
ENV PYTHONUNBUFFERED=1 \
    HOST=0.0.0.0 \
    PORT=8000 \
    WORKERS=1 \
    MAX_CONCURRENT_ENVS=16

# Container-level liveness probe against the FastAPI /health endpoint.
HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
    CMD curl -f http://localhost:${PORT}/health || exit 1

EXPOSE ${PORT}
CMD ["python", "-m", "server.app"]
pytest-cache-files-1f62ra1g/container_sim/server/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
"""Public exports for the Python code review environment server package."""

from .code_review_environment import (
    CodeReviewEnvironment,
    PythonCodeReviewEnvironment,
    PythonEnvironment,
)

__all__ = ["PythonEnvironment", "PythonCodeReviewEnvironment", "CodeReviewEnvironment"]
pytest-cache-files-1f62ra1g/container_sim/server/app.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FastAPI application for the Python code review environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
+ from fastapi import APIRouter, HTTPException
8
+ from fastapi.responses import RedirectResponse
9
+
10
+ try:
11
+ from compat import create_app
12
+ from models import (
13
+ HealthResponse,
14
+ PythonCodeReviewAction,
15
+ PythonCodeReviewObservation,
16
+ PythonCodeReviewState,
17
+ TaskDescriptor,
18
+ TaskGrade,
19
+ )
20
+ except Exception:
21
+ from .compat import create_app
22
+ from .models import (
23
+ HealthResponse,
24
+ PythonCodeReviewAction,
25
+ PythonCodeReviewObservation,
26
+ PythonCodeReviewState,
27
+ TaskDescriptor,
28
+ TaskGrade,
29
+ )
30
+ from server.env import PythonCodeReviewEnvironment
31
+
32
+
33
# Concurrency cap for simultaneously hosted environment sessions; the env var
# must parse as an int, and the effective value is clamped to at least 1.
try:
    MAX_CONCURRENT_ENVS = max(int(os.getenv("MAX_CONCURRENT_ENVS", "16")), 1)
except Exception:
    # Unparseable override — fall back to the documented default.
    MAX_CONCURRENT_ENVS = 16

# Singleton environment backing the custom convenience endpoints below
# (/health, /tasks, grading); episode sessions go through create_app's routes.
python_env = PythonCodeReviewEnvironment(verbose=False)
# OpenEnv-provided FastAPI app wired to this environment's action/observation types.
app = create_app(
    PythonCodeReviewEnvironment,
    PythonCodeReviewAction,
    PythonCodeReviewObservation,
    max_concurrent_envs=MAX_CONCURRENT_ENVS,
)
# Router collecting the extra endpoints registered below via include_router.
router = APIRouter(tags=["python-code-review"])
46
+
47
+
48
@router.get("/", include_in_schema=False)
def root() -> RedirectResponse:
    """Send visitors hitting the bare root to the interactive API docs."""
    docs_redirect = RedirectResponse(url="/docs")
    return docs_redirect
52
+
53
+
54
@router.get("/health", response_model=HealthResponse)
def health() -> HealthResponse:
    """Report service liveness for deployment monitoring."""
    status = python_env.health()
    return status
58
+
59
+
60
@router.get("/tasks", response_model=list)
def list_tasks() -> list:
    """Enumerate public summaries of every deterministic task."""
    summaries = python_env.list_task_summaries()
    return summaries
64
+
65
+
66
@router.get("/tasks/{task_id}", response_model=object)
def get_task(task_id: str) -> object:
    """Look up a single task descriptor by its identifier (404 when unknown)."""
    try:
        task = python_env.get_task(task_id)
    except ValueError as exc:
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    return task
73
+
74
+
75
@router.post("/tasks/{task_id}/grade", response_model=TaskGrade)
def grade_task(task_id: str, payload: PythonCodeReviewAction) -> TaskGrade:
    """Grade a code submission for a task without running an episode.

    Requires an ``edit_code`` action carrying non-empty code; unknown task
    ids surface as 404.
    """
    is_code_edit = payload.action_type == "edit_code"
    if not (is_code_edit and payload.code):
        raise HTTPException(
            status_code=400,
            detail="Requires action_type='edit_code' with code parameter."
        )
    try:
        return python_env.grade_task_submission(task_id=task_id, code=payload.code)
    except ValueError as exc:
        raise HTTPException(status_code=404, detail=str(exc)) from exc
87
+
88
+
89
@router.post("/state", response_model=PythonCodeReviewState)
def get_state_post() -> RedirectResponse:
    """Compatibility alias: forward POST /state to the GET handler via 303."""
    redirect = RedirectResponse(url="/state", status_code=303)
    return redirect
93
+
94
+
95
# Register the convenience endpoints defined above.
app.include_router(router)


def _prioritize_route(path: str, methods: set[str]) -> None:
    """Promote a matching custom route so it wins over default OpenEnv routes.

    FastAPI matches routes in registration order, so the last-registered
    custom route is popped and re-inserted at index 0. Best-effort: any
    failure leaves the route table untouched.
    """
    try:
        routes = app.router.routes
        for idx in reversed(range(len(routes))):
            candidate = routes[idx]
            if getattr(candidate, "path", None) != path:
                continue
            supported = set(getattr(candidate, "methods", set()) or set())
            if methods.issubset(supported):
                routes.insert(0, routes.pop(idx))
                break
    except Exception:
        pass


# Ensure our /health handler shadows any default route added by create_app.
_prioritize_route("/health", {"GET"})
113
+
114
+
115
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Start the FastAPI app under uvicorn, honoring HOST/PORT env overrides."""
    import uvicorn

    bind_host = os.getenv("HOST", host)
    bind_port = int(os.getenv("PORT", str(port)))
    uvicorn.run(app, host=bind_host, port=bind_port)
123
+
124
+
125
+ if __name__ == "__main__":
126
+ main()
127
+
pytest-cache-files-1f62ra1g/container_sim/server/code_review_env_environment.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
"""Compatibility shim so legacy import paths keep resolving."""

try:
    # Absolute import works when the code is deployed as the `server` package.
    from server.code_review_environment import CodeReviewEnvironment
except ModuleNotFoundError:  # pragma: no cover - relative fallback
    from .code_review_environment import CodeReviewEnvironment


__all__ = ["CodeReviewEnvironment"]
pytest-cache-files-1f62ra1g/container_sim/server/code_review_environment.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
"""Compatibility wrapper re-exporting environment classes for older imports."""

from .env import (
    CodeReviewEnvironment,
    PythonCodeReviewEnvironment,
    PythonEnvironment,
)

__all__ = ["CodeReviewEnvironment", "PythonCodeReviewEnvironment", "PythonEnvironment"]
pytest-cache-files-1f62ra1g/container_sim/server/compat.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Compatibility helpers for OpenEnv and FastMCP runtime drift."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ import types
7
+ from typing import Any
8
+
9
+
10
def install_openenv_fastmcp_compat() -> None:
    """Patch FastMCP API differences so older OpenEnv builds keep importing.

    Best-effort in two independent phases; each phase swallows its own
    failures so a partial patch never blocks startup.
    """
    try:
        import fastmcp  # type: ignore
    except Exception:
        # FastMCP is absent entirely; nothing to shim.
        return

    # Phase 1: provide a stand-in `fastmcp.Client` for builds that lack one.
    try:
        if not hasattr(fastmcp, "Client"):
            class CompatClient:
                """Minimal async MCP client used for legacy OpenEnv imports."""

                def __init__(self, *args: Any, **kwargs: Any) -> None:
                    self.args = args
                    self.kwargs = kwargs

                async def __aenter__(self) -> "CompatClient":
                    return self

                async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool:
                    return False

                async def list_tools(self) -> list[Any]:
                    return []

                async def call_tool(self, tool_name: str, arguments: dict[str, Any]) -> Any:
                    # Stub cannot execute tools; make that explicit for callers.
                    raise RuntimeError(
                        f"MCP client compatibility mode cannot call tool: {tool_name}"
                    )

            fastmcp.Client = CompatClient  # type: ignore[attr-defined]
    except Exception:
        pass

    # Phase 2: guarantee `fastmcp.client.client.CallToolResult` is importable,
    # creating synthetic modules in sys.modules when needed.
    try:
        pkg = sys.modules.get("fastmcp.client")
        if pkg is None:
            pkg = types.ModuleType("fastmcp.client")
            sys.modules["fastmcp.client"] = pkg

        mod = sys.modules.get("fastmcp.client.client")
        if mod is None:
            mod = types.ModuleType("fastmcp.client.client")
            sys.modules["fastmcp.client.client"] = mod

        if not hasattr(mod, "CallToolResult"):
            class CallToolResult:
                """Compatibility container for legacy OpenEnv response handling."""

                def __init__(
                    self,
                    content: Any = None,
                    structured_content: Any = None,
                    meta: Any = None,
                    data: Any = None,
                    is_error: bool = False,
                ) -> None:
                    self.content = content
                    self.structured_content = structured_content
                    self.meta = meta
                    self.data = data
                    self.is_error = is_error

            mod.CallToolResult = CallToolResult

        pkg.client = mod  # type: ignore[attr-defined]
    except Exception:
        pass
+
79
+
80
+ install_openenv_fastmcp_compat()
81
+
82
+
83
+ from openenv.core.env_server.http_server import create_app as openenv_create_app
84
+ from openenv.core.env_server.interfaces import Environment
85
+ from openenv.core.env_server.types import Action, Observation, State
86
+
87
+
88
+ create_app = openenv_create_app
89
+
pytest-cache-files-1f62ra1g/container_sim/server/env.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .env_safe import * # noqa: F401,F403
pytest-cache-files-1f62ra1g/container_sim/server/env_safe.py ADDED
@@ -0,0 +1,505 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Safe OpenEnv environment for deterministic Python code repair tasks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Optional
6
+ from uuid import uuid4
7
+
8
+ try:
9
+ from compat import Environment
10
+ from graders import grade_task
11
+ from models import (
12
+ HealthResponse,
13
+ HistoryEntry,
14
+ PythonCodeReviewAction,
15
+ PythonCodeReviewObservation,
16
+ PythonCodeReviewState,
17
+ RewardDetails,
18
+ TaskGrade,
19
+ )
20
+ from tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
21
+ except Exception:
22
+ from .compat import Environment
23
+ from .graders import grade_task
24
+ from .models import (
25
+ HealthResponse,
26
+ HistoryEntry,
27
+ PythonCodeReviewAction,
28
+ PythonCodeReviewObservation,
29
+ PythonCodeReviewState,
30
+ RewardDetails,
31
+ TaskGrade,
32
+ )
33
+ from .tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
34
+
35
+
36
+ INVALID_ACTION_PENALTY = 0.10
37
+ NO_PROGRESS_PENALTY = 0.08
38
+ REPEATED_ACTION_PENALTY = 0.05
39
+ BASE_STEP_PENALTY = 0.02
40
+ ANALYZE_STEP_PENALTY = 0.01
41
+ SUBMIT_COMPLETION_BONUS = 0.30
42
+ TIMEOUT_PENALTY = 0.12
43
+ VALID_ACTIONS = {"analyze_code", "edit_code", "run_tests", "submit_solution"}
44
+
45
+
46
+ def _clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
47
+ """Clamp a scalar to a bounded numeric interval."""
48
+ try:
49
+ return max(low, min(high, float(value)))
50
+ except Exception:
51
+ return low
52
+
53
+
54
+ def _safe_text(value: Any, default: str = "") -> str:
55
+ """Convert values into short stable strings."""
56
+ try:
57
+ text = str(value)
58
+ except Exception:
59
+ return default
60
+ text = " ".join(text.split())
61
+ return text[:240] if text else default
62
+
63
+
64
+ class PythonCodeReviewEnvironment(
65
+ Environment[PythonCodeReviewAction, PythonCodeReviewObservation, PythonCodeReviewState]
66
+ ):
67
+ """Deterministic, bounded, evaluator-safe environment for code repair tasks."""
68
+
69
+ SUPPORTS_CONCURRENT_SESSIONS = True
70
+
71
+ def __init__(self, verbose: bool = False) -> None:
72
+ super().__init__()
73
+ self._verbose = bool(verbose)
74
+ self._task_order = self._safe_task_order()
75
+ self._task_cursor = -1
76
+ self._task: Optional[TaskSpec] = None
77
+ self._state = PythonCodeReviewState(episode_id=str(uuid4()))
78
+ self._done = False
79
+ self._last_status = "Call reset() to start."
80
+ self._last_reward = RewardDetails(value=0.0, reason="Environment initialized.")
81
+ self._metrics = self._blank_metrics()
82
+ self._last_action_type = ""
83
+
84
+ def reset(
85
+ self,
86
+ seed: Optional[int] = None,
87
+ episode_id: Optional[str] = None,
88
+ task_id: Optional[str] = None,
89
+ **_: object,
90
+ ) -> PythonCodeReviewObservation:
91
+ """Reset the environment for a deterministic task and return an observation."""
92
+ del seed
93
+ try:
94
+ self._reset_rubric()
95
+ except Exception:
96
+ pass
97
+
98
+ task = self._select_task(task_id)
99
+ self._task = task
100
+ self._done = False
101
+ self._metrics = self._blank_metrics()
102
+ self._last_action_type = ""
103
+ self._last_status = "Inspect the code, run checks, edit the code, then submit."
104
+ self._last_reward = RewardDetails(
105
+ value=0.0,
106
+ reason="Episode reset.",
107
+ prev_score=0.0,
108
+ curr_score=0.0,
109
+ )
110
+ self._state = PythonCodeReviewState(
111
+ episode_id=episode_id or str(uuid4()),
112
+ step_count=0,
113
+ task_id=task.task_id,
114
+ difficulty=task.difficulty,
115
+ task_kind=task.task_kind,
116
+ attempts_remaining=max(int(task.max_steps), 1),
117
+ current_code=task.starter_code,
118
+ errors="",
119
+ test_results="No checks run yet.",
120
+ history=[],
121
+ score=0.0,
122
+ done=False,
123
+ )
124
+ return self._build_observation()
125
+
126
+ def step(
127
+ self,
128
+ action: PythonCodeReviewAction,
129
+ timeout_s: Optional[float] = None,
130
+ **_: object,
131
+ ) -> PythonCodeReviewObservation:
132
+ """Execute one safe environment step and always return a valid observation."""
133
+ del timeout_s
134
+ try:
135
+ if self._task is None:
136
+ return self.reset()
137
+
138
+ if self._done:
139
+ self._last_status = "Episode already completed. Call reset() to continue."
140
+ self._last_reward = RewardDetails(
141
+ value=-INVALID_ACTION_PENALTY,
142
+ invalid_action_penalty=INVALID_ACTION_PENALTY,
143
+ reason="Episode already completed.",
144
+ prev_score=self._metrics["score"],
145
+ curr_score=self._metrics["score"],
146
+ code_changed=False,
147
+ )
148
+ return self._build_observation()
149
+
150
+ self._state.step_count += 1
151
+ action_type = _safe_text(getattr(action, "action_type", "analyze_code"), "analyze_code")
152
+ code = getattr(action, "code", None)
153
+
154
+ if action_type == "analyze_code":
155
+ self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=False)
156
+ elif action_type == "run_tests":
157
+ self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=False)
158
+ elif action_type == "edit_code":
159
+ self._handle_edit(code)
160
+ elif action_type == "submit_solution":
161
+ self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=True)
162
+ self._done = True
163
+ else:
164
+ self._apply_invalid_action(f"Unsupported action_type '{action_type}'.")
165
+
166
+ self._state.attempts_remaining = max(self._task.max_steps - self._state.step_count, 0)
167
+ if self._state.attempts_remaining == 0 and not self._done:
168
+ self._auto_submit()
169
+
170
+ self._state.done = self._done
171
+ return self._build_observation()
172
+ except Exception as exc:
173
+ self._apply_invalid_action(f"Step failure handled: {_safe_text(exc, 'unknown_error')}")
174
+ self._state.done = self._done
175
+ return self._build_observation()
176
+
177
+ @property
178
+ def state(self) -> PythonCodeReviewState:
179
+ """Return a deep copy of the current environment state."""
180
+ try:
181
+ return self._state.model_copy(deep=True)
182
+ except Exception:
183
+ return PythonCodeReviewState(episode_id=str(uuid4()))
184
+
185
+ def list_task_summaries(self) -> list[object]:
186
+ """Return public task summaries."""
187
+ try:
188
+ return list_task_summaries()
189
+ except Exception:
190
+ return []
191
+
192
+ def get_task(self, task_id: str) -> object:
193
+ """Return a single public task descriptor."""
194
+ return self._select_task(task_id).to_descriptor()
195
+
196
+ def health(self) -> HealthResponse:
197
+ """Return a simple health response."""
198
+ return HealthResponse(task_count=len(self._task_order))
199
+
200
+ def grade_task_submission(self, task_id: str, code: str) -> TaskGrade:
201
+ """Grade a task submission outside an episode without raising."""
202
+ try:
203
+ task = self._select_task(task_id)
204
+ return self._safe_grade(task=task, candidate_code=code, include_hidden=True)
205
+ except Exception as exc:
206
+ return TaskGrade(score=0.0, details={"error": _safe_text(exc, "grading_failed")})
207
+
208
+ def run_tests(self, code: str, include_hidden: bool = False) -> tuple[float, dict[str, int], TaskGrade]:
209
+ """Run deterministic grading and return score plus test summary."""
210
+ task = self._task or self._select_task(None)
211
+ grade = self._safe_grade(task=task, candidate_code=code, include_hidden=include_hidden)
212
+ return (
213
+ _clamp(grade.score),
214
+ {"passed": int(grade.tests_passed), "total": int(grade.tests_total)},
215
+ grade,
216
+ )
217
+
218
+ def apply_action(self, action: PythonCodeReviewAction) -> str:
219
+ """Return the candidate code implied by the action."""
220
+ if getattr(action, "action_type", "") == "edit_code":
221
+ code = getattr(action, "code", None)
222
+ return str(code) if code is not None else self._state.current_code
223
+ return self._state.current_code
224
+
225
+ def compute_reward(
226
+ self,
227
+ action_type: str,
228
+ previous_metrics: dict[str, float],
229
+ current_metrics: dict[str, float],
230
+ grade: TaskGrade,
231
+ code_changed: bool,
232
+ invalid_action: bool = False,
233
+ ) -> RewardDetails:
234
+ """Compute a bounded dynamic reward with progress and efficiency shaping."""
235
+ prev_score = _clamp(previous_metrics.get("score", 0.0))
236
+ curr_score = _clamp(current_metrics.get("score", 0.0))
237
+ score_delta = curr_score - prev_score
238
+ test_delta = current_metrics.get("test_fraction", 0.0) - previous_metrics.get("test_fraction", 0.0)
239
+ syntax_delta = current_metrics.get("syntax_score", 0.0) - previous_metrics.get("syntax_score", 0.0)
240
+ quality_delta = current_metrics.get("quality_score", 0.0) - previous_metrics.get("quality_score", 0.0)
241
+
242
+ step_penalty = BASE_STEP_PENALTY + (ANALYZE_STEP_PENALTY if action_type == "analyze_code" else 0.0)
243
+ repeated_penalty = REPEATED_ACTION_PENALTY if action_type == self._last_action_type else 0.0
244
+ no_progress = (
245
+ score_delta <= 1e-9
246
+ and test_delta <= 1e-9
247
+ and syntax_delta <= 1e-9
248
+ and quality_delta <= 1e-9
249
+ and not code_changed
250
+ )
251
+ stagnation_penalty = NO_PROGRESS_PENALTY if no_progress and not invalid_action else 0.0
252
+ regression_penalty = max(-score_delta, 0.0) * 0.6 + repeated_penalty + step_penalty
253
+ invalid_penalty = INVALID_ACTION_PENALTY if invalid_action else 0.0
254
+ timeout_penalty = TIMEOUT_PENALTY if bool(grade.timed_out) else 0.0
255
+
256
+ progress_reward = max(score_delta, 0.0) * 0.7
257
+ syntax_reward = max(syntax_delta, 0.0) * 0.5
258
+ test_reward = max(test_delta, 0.0) * 1.0
259
+ quality_bonus = max(quality_delta, 0.0) * 0.2
260
+ correctness_bonus = SUBMIT_COMPLETION_BONUS if action_type == "submit_solution" and curr_score >= 0.999 else 0.0
261
+
262
+ reward_value = (
263
+ progress_reward
264
+ + syntax_reward
265
+ + test_reward
266
+ + quality_bonus
267
+ + correctness_bonus
268
+ - stagnation_penalty
269
+ - regression_penalty
270
+ - invalid_penalty
271
+ - timeout_penalty
272
+ )
273
+ reward_value = max(-1.0, min(1.0, round(reward_value, 6)))
274
+ return RewardDetails(
275
+ value=reward_value,
276
+ syntax_reward=round(syntax_reward, 6),
277
+ test_reward=round(test_reward, 6),
278
+ quality_bonus=round(quality_bonus, 6),
279
+ correctness_bonus=round(correctness_bonus, 6),
280
+ progress_delta=round(progress_reward, 6),
281
+ stagnation_penalty=round(stagnation_penalty, 6),
282
+ regression_penalty=round(regression_penalty, 6),
283
+ invalid_action_penalty=round(invalid_penalty, 6),
284
+ timeout_penalty=round(timeout_penalty, 6),
285
+ reason=f"{action_type} reward computed safely",
286
+ prev_score=round(prev_score, 6),
287
+ curr_score=round(curr_score, 6),
288
+ code_changed=bool(code_changed),
289
+ )
290
+
291
+ def _safe_task_order(self) -> list[str]:
292
+ """Load deterministic task ids with a hard fallback."""
293
+ try:
294
+ loaded = list(task_ids())
295
+ if loaded:
296
+ return [str(task_id) for task_id in loaded]
297
+ except Exception:
298
+ pass
299
+ return ["syntax-fix-easy", "bug-fix-medium", "optimization-hard"]
300
+
301
+ def _blank_metrics(self) -> dict[str, float]:
302
+ """Return an empty metric snapshot."""
303
+ return {
304
+ "score": 0.0,
305
+ "test_fraction": 0.0,
306
+ "syntax_score": 0.0,
307
+ "quality_score": 0.0,
308
+ }
309
+
310
+ def _select_task(self, task_id: Optional[str]) -> TaskSpec:
311
+ """Select the requested task or advance deterministically."""
312
+ try:
313
+ if task_id:
314
+ task = load_task(task_id)
315
+ if task.task_id in self._task_order:
316
+ self._task_cursor = self._task_order.index(task.task_id)
317
+ return task
318
+ except Exception:
319
+ pass
320
+
321
+ try:
322
+ self._task_cursor = (self._task_cursor + 1) % len(self._task_order)
323
+ return load_task(self._task_order[self._task_cursor])
324
+ except Exception:
325
+ return load_task("syntax-fix-easy")
326
+
327
+ def _safe_grade(self, task: TaskSpec, candidate_code: str, include_hidden: bool) -> TaskGrade:
328
+ """Run grading without allowing exceptions to escape."""
329
+ try:
330
+ return grade_task(candidate_code, task, include_hidden=include_hidden)
331
+ except Exception as exc:
332
+ return TaskGrade(
333
+ score=0.0,
334
+ syntax_score=0.0,
335
+ tests_passed=0,
336
+ tests_total=max(len(task.visible_tests), 1),
337
+ details={"compile_error": "", "error": _safe_text(exc, "grading_failed")},
338
+ )
339
+
340
+ def _metrics_from_grade(self, grade: TaskGrade) -> dict[str, float]:
341
+ """Derive normalized reward metrics from a grading result."""
342
+ tests_total = max(int(grade.tests_total), 0)
343
+ tests_passed = max(int(grade.tests_passed), 0)
344
+ test_fraction = (tests_passed / tests_total) if tests_total else _clamp(grade.syntax_score)
345
+ return {
346
+ "score": _clamp(grade.score),
347
+ "test_fraction": _clamp(test_fraction),
348
+ "syntax_score": _clamp(grade.syntax_score),
349
+ "quality_score": _clamp(grade.quality_score),
350
+ }
351
+
352
+ def _format_test_results(self, grade: TaskGrade, include_hidden: bool) -> str:
353
+ """Format test execution results for the observation."""
354
+ compile_error = _safe_text(grade.details.get("compile_error", ""), "")
355
+ scope = "all checks" if include_hidden else "visible checks"
356
+ if compile_error:
357
+ return f"{scope}: compile error: {compile_error}"
358
+ if grade.timed_out:
359
+ return f"{scope}: execution timed out"
360
+ if self._task and self._task.task_kind == "syntax_fix":
361
+ return "visible checks: code compiles successfully"
362
+ return f"{scope}: {int(grade.tests_passed)}/{int(grade.tests_total)} passing"
363
+
364
def _build_status(self, action_type: str, grade: TaskGrade) -> str:
    """Produce the human-readable status line for the latest action."""
    if action_type == "submit_solution":
        return f"Solution submitted. Final score: {_clamp(grade.score):.3f}"
    if action_type == "edit_code":
        has_syntax_issue = bool(grade.details.get("compile_error"))
        if has_syntax_issue:
            return "Code updated, but syntax issues remain."
        return "Code updated and evaluated."
    static_messages = {
        "run_tests": "Test run completed.",
        "analyze_code": "Analysis completed.",
    }
    return static_messages.get(action_type, "Action handled safely.")
377
+
378
+ def _apply_grade_to_state(self, grade: TaskGrade, include_hidden: bool) -> None:
379
+ """Update environment state from the latest grading result."""
380
+ compile_error = _safe_text(grade.details.get("compile_error", ""), "")
381
+ self._state.score = _clamp(grade.score)
382
+ self._state.errors = compile_error
383
+ self._state.test_results = self._format_test_results(grade, include_hidden=include_hidden)
384
+
385
    def _handle_scored_action(self, action_type: str, candidate_code: str, include_hidden: bool) -> None:
        """Grade code, update state, and compute reward for a valid action.

        Order matters here: the previous metrics and prior code are
        snapshotted before any mutation so the reward can be computed as a
        delta between the pre- and post-action scores.
        """
        task = self._task or self._select_task(None)
        previous_metrics = dict(self._metrics)
        prior_code = self._state.current_code
        # Whitespace-insensitive change detection feeds the reward function.
        code_changed = candidate_code.strip() != prior_code.strip()
        # Only edit actions replace the working copy; other actions grade as-is.
        if action_type == "edit_code":
            self._state.current_code = candidate_code
        grade = self._safe_grade(task=task, candidate_code=self._state.current_code, include_hidden=include_hidden)
        current_metrics = self._metrics_from_grade(grade)
        self._apply_grade_to_state(grade, include_hidden=include_hidden)
        self._last_reward = self.compute_reward(
            action_type=action_type,
            previous_metrics=previous_metrics,
            current_metrics=current_metrics,
            grade=grade,
            code_changed=code_changed,
            invalid_action=False,
        )
        self._last_status = self._build_status(action_type, grade)
        self._metrics = current_metrics
        self._last_action_type = action_type
        self._append_history(action_type, self._last_status, self._last_reward.value)
408
+
409
+ def _handle_edit(self, code: Optional[str]) -> None:
410
+ """Validate edit input and evaluate the new candidate code."""
411
+ safe_code = (code or "").strip()
412
+ if not safe_code:
413
+ self._apply_invalid_action("edit_code requires code parameter.")
414
+ return
415
+ self._handle_scored_action(action_type="edit_code", candidate_code=safe_code, include_hidden=False)
416
+
417
    def _apply_invalid_action(self, reason: str) -> None:
        """Record an invalid action without crashing the episode.

        The cached metrics are reused as both "previous" and "current" so the
        reward reflects only the invalid-action penalty, never a score delta.
        """
        previous_metrics = dict(self._metrics)
        # Synthesize a grade from cached metrics; nothing is re-graded here.
        grade = TaskGrade(score=previous_metrics["score"], syntax_score=previous_metrics["syntax_score"])
        self._last_reward = self.compute_reward(
            action_type="invalid",
            previous_metrics=previous_metrics,
            current_metrics=previous_metrics,
            grade=grade,
            code_changed=False,
            invalid_action=True,
        )
        self._last_status = reason
        # History entries require a valid action type, so log under analyze_code.
        self._append_history("analyze_code", reason, self._last_reward.value)
431
+
432
    def _auto_submit(self) -> None:
        """Finalize the episode when attempts are exhausted.

        Grades the current code against all checks (hidden tests included)
        and marks the episode done. NOTE(review): unlike explicit submits,
        this path does not append a history entry, update ``self._metrics``,
        or recompute ``self._last_reward`` — confirm that is intentional.
        """
        task = self._task or self._select_task(None)
        grade = self._safe_grade(task=task, candidate_code=self._state.current_code, include_hidden=True)
        self._apply_grade_to_state(grade, include_hidden=True)
        self._done = True
        self._state.done = True
        self._last_status = f"Auto-submitted. Final score: {_clamp(grade.score):.3f}"
440
+
441
+ def _append_history(self, action_type: str, status: str, reward: float) -> None:
442
+ """Append one action record to the episode history."""
443
+ try:
444
+ stable_action = action_type if action_type in VALID_ACTIONS else "analyze_code"
445
+ self._state.history.append(
446
+ HistoryEntry(
447
+ step=max(int(self._state.step_count), 0),
448
+ action_type=stable_action,
449
+ status=_safe_text(status, "handled"),
450
+ reward=float(reward),
451
+ )
452
+ )
453
+ except Exception:
454
+ pass
455
+
456
    def _build_observation(self) -> PythonCodeReviewObservation:
        """Build a valid observation from current state.

        Falls back to a minimal, schema-valid observation when construction
        fails for any reason, so a step can never crash while reporting.
        """
        task = self._task
        try:
            return PythonCodeReviewObservation(
                task_id=self._state.task_id or "",
                title=task.title if task else "",
                difficulty=self._state.difficulty or "easy",
                task_kind=self._state.task_kind,
                task_description=task.task_description if task else "",
                current_code=self._state.current_code,
                errors=self._state.errors,
                test_results=self._state.test_results,
                # Defensive copies keep internal state immutable to consumers.
                visible_tests=list(task.visible_tests) if task else [],
                history=list(self._state.history),
                attempts_remaining=max(int(self._state.attempts_remaining), 0),
                last_action_status=self._last_status,
                score=_clamp(self._state.score),
                reward_details=self._last_reward,
                reward=self._last_reward.value,
                done=bool(self._state.done),
                metadata={
                    "prev_score": self._last_reward.prev_score,
                    "curr_score": self._last_reward.curr_score,
                },
            )
        except Exception as exc:
            # Fallback observation: surface the failure in `errors` while
            # keeping every field schema-valid for downstream consumers.
            return PythonCodeReviewObservation(
                task_id=self._state.task_id or "",
                title="",
                difficulty="easy",
                task_kind=None,
                task_description="",
                current_code=getattr(self._state, "current_code", ""),
                errors=_safe_text(exc, "observation_build_failed"),
                test_results="visible checks: unavailable",
                visible_tests=[],
                history=[],
                attempts_remaining=0,
                last_action_status="Observation fallback returned safely.",
                score=0.0,
                reward_details=RewardDetails(value=0.0, reason="Observation fallback."),
                reward=0.0,
                done=bool(getattr(self._state, "done", False)),
                metadata={},
            )
502
+
503
+
504
# Backwards-compatible aliases so older import paths keep working.
PythonEnvironment = PythonCodeReviewEnvironment
CodeReviewEnvironment = PythonCodeReviewEnvironment
pytest-cache-files-1f62ra1g/container_sim/server/graders/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic graders for self-contained server builds."""
2
+
3
+ from .common import clamp_score
4
+ from .optimization import grade_optimization_task
5
+ from .pytest_runner import PytestExecution, run_pytest_suite
6
+ from .syntax import grade_bug_fix_task, grade_syntax_task, grade_task
7
+
8
+ __all__ = [
9
+ "PytestExecution",
10
+ "clamp_score",
11
+ "grade_bug_fix_task",
12
+ "grade_optimization_task",
13
+ "grade_syntax_task",
14
+ "grade_task",
15
+ "run_pytest_suite",
16
+ ]
17
+
pytest-cache-files-1f62ra1g/container_sim/server/graders/common.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared deterministic scoring helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ast
6
+ import difflib
7
+ import traceback
8
+ from typing import Tuple
9
+
10
+
11
def clamp_score(value: float) -> float:
    """Round *value* to six decimals and clamp the result into [0.0, 1.0]."""
    rounded = round(value, 6)
    return min(1.0, max(0.0, rounded))
13
+
14
+
15
def syntax_error_message(code: str) -> str:
    """Return a short parse-error description, or "" when *code* parses."""
    try:
        ast.parse(code)
        return ""
    except SyntaxError as exc:
        return f"{exc.msg} (line {exc.lineno}, column {exc.offset})"
    except Exception:
        # Non-SyntaxError parse failures: keep only the innermost frame.
        return traceback.format_exc(limit=1).strip()
23
+
24
+
25
def compiles(code: str) -> bool:
    """Return True when *code* byte-compiles as an executable module."""
    try:
        compile(code, "<candidate>", "exec")
        return True
    except Exception:
        return False
31
+
32
+
33
def normalized_diff_score(code: str, reference_code: str) -> float:
    """Whitespace-insensitive similarity ratio between two code strings."""
    stripped_candidate = "".join(code.split())
    stripped_reference = "".join(reference_code.split())
    matcher = difflib.SequenceMatcher(a=stripped_candidate, b=stripped_reference)
    return clamp_score(matcher.ratio())
39
+
40
+
41
def style_score(code: str, max_line_length: int = 88) -> float:
    """Score basic hygiene: line length (60%), tabs (20%), trailing ws (20%)."""
    lines = code.splitlines() or [""]
    within_limit = sum(1 for line in lines if len(line) <= max_line_length)
    length_fraction = within_limit / len(lines)
    no_tabs = 0.0 if any("\t" in line for line in lines) else 1.0
    no_trailing = 0.0 if any(line != line.rstrip() for line in lines) else 1.0
    return clamp_score((length_fraction * 0.6) + (no_tabs * 0.2) + (no_trailing * 0.2))
47
+
48
+
49
def nested_loop_depth(tree: ast.AST) -> int:
    """Return the maximum nesting depth of loop statements in *tree*."""
    loop_nodes = (ast.For, ast.AsyncFor, ast.While)
    deepest = 0

    def descend(node: ast.AST, depth: int) -> None:
        nonlocal deepest
        if isinstance(node, loop_nodes):
            depth += 1
            deepest = max(deepest, depth)
        for child in ast.iter_child_nodes(node):
            descend(child, depth)

    descend(tree, 0)
    return deepest
62
+
63
+
64
def compile_tree(code: str) -> Tuple[ast.AST | None, str]:
    """Parse *code*; return (tree, "") on success or (None, error message)."""
    try:
        tree = ast.parse(code)
    except SyntaxError as exc:
        return None, f"{exc.msg} (line {exc.lineno}, column {exc.offset})"
    return tree, ""
69
+
pytest-cache-files-1f62ra1g/container_sim/server/graders/optimization.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic grading for optimization tasks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import subprocess
7
+ import sys
8
+ import tempfile
9
+ from pathlib import Path
10
+
11
+ from .common import clamp_score, compile_tree, nested_loop_depth, style_score
12
+ from .pytest_runner import run_pytest_suite
13
+ from ..models import TaskGrade
14
+ from ..tasks.task_bank import TaskSpec
15
+
16
+
17
def _benchmark_script(task: TaskSpec) -> str:
    """Render the timing script executed for one implementation.

    The generated script imports ``task.benchmark_entrypoint`` from the
    ``candidate`` module, builds input events via ``task.benchmark_builder``
    (which must define ``build_benchmark_events``), times
    ``task.benchmark_repeats`` repetitions, and writes the elapsed time and
    row count to ``benchmark.json`` in the working directory.
    """
    return f"""import json
import time
from candidate import {task.benchmark_entrypoint}

{task.benchmark_builder}

events = build_benchmark_events()
start = time.perf_counter()
for _ in range({task.benchmark_repeats}):
    result = {task.benchmark_entrypoint}(events)
elapsed = time.perf_counter() - start
Path = __import__("pathlib").Path
Path("benchmark.json").write_text(json.dumps({{"elapsed": elapsed, "rows": len(result)}}), encoding="utf-8")
"""
32
+
33
+
34
def benchmark_runtime(candidate_code: str, task: TaskSpec) -> tuple[float, bool, str]:
    """Benchmark the candidate against the starter implementation.

    Both implementations run the same generated timing script inside a
    temporary sandbox; the runtime score scales linearly with the measured
    speedup, reaching 1.0 at a 4x speedup.

    Returns:
        ``(runtime_score, timed_out, combined_output)`` — any failure mode
        degrades to ``(0.0, ..., message)`` instead of raising.
    """
    assert task.benchmark_entrypoint is not None
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-bench-") as temp_dir:
            temp_path = Path(temp_dir)
            result_path = temp_path / "benchmark.json"
            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (temp_path / "starter.py").write_text(task.starter_code, encoding="utf-8")
            (temp_path / "candidate_runner.py").write_text(_benchmark_script(task), encoding="utf-8")
            starter_script = _benchmark_script(task).replace("from candidate import", "from starter import")
            (temp_path / "starter_runner.py").write_text(starter_script, encoding="utf-8")

            def _timed_run(script_name: str):
                """Run one timing script and read its freshly written payload."""
                run = subprocess.run(
                    [sys.executable, script_name],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=task.benchmark_timeout_s,
                    check=False,
                )
                payload = json.loads(result_path.read_text(encoding="utf-8"))
                # Bug fix: both runners write the same benchmark.json. Remove
                # it after each read so a crashed later run cannot silently
                # reuse the previous run's timings (which made a broken
                # candidate look exactly as fast as the starter).
                result_path.unlink()
                return run, payload

            try:
                starter_run, starter_payload = _timed_run("starter_runner.py")
                candidate_run, candidate_payload = _timed_run("candidate_runner.py")
            except subprocess.TimeoutExpired as exc:
                output = (exc.stdout or "") + (exc.stderr or "")
                return 0.0, True, (output or "benchmark timed out").strip()
            except Exception as exc:
                return 0.0, False, str(exc)

            # Guard against zero elapsed times before dividing.
            starter_elapsed = max(float(starter_payload["elapsed"]), 1e-9)
            candidate_elapsed = max(float(candidate_payload["elapsed"]), 1e-9)
            speedup = starter_elapsed / candidate_elapsed
            # Linear credit for speedups between 1x and 4x.
            runtime_score = clamp_score(min((speedup - 1.0) / 3.0, 1.0))
            output = "\n".join(
                part
                for part in [
                    starter_run.stdout.strip(),
                    starter_run.stderr.strip(),
                    candidate_run.stdout.strip(),
                    candidate_run.stderr.strip(),
                    f"starter={starter_elapsed:.6f}s candidate={candidate_elapsed:.6f}s speedup={speedup:.2f}x",
                ]
                if part
            )
            return runtime_score, False, output
    except Exception as exc:
        return 0.0, False, str(exc)
88
+
89
+
90
def ast_quality_score(code: str, task: TaskSpec) -> float:
    """Heuristic quality score: docstring, flat loops, expected idioms."""
    tree, _ = compile_tree(code)
    if tree is None:
        return 0.0
    import ast

    first_function = next(
        (node for node in tree.body if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))),
        None,
    )
    docstring_points = 0.0
    if first_function is not None and ast.get_docstring(first_function, clean=False):
        docstring_points = 0.2
    nested_points = 0.4 if nested_loop_depth(tree) <= 1 else 0.0
    # Each expected idiom found verbatim in the source adds a fixed bonus.
    marker_points = 0.0
    for marker in task.expected_quality_markers:
        if marker in code:
            marker_points += 0.2
    return clamp_score(docstring_points + nested_points + marker_points)
104
+
105
+
106
def grade_optimization_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Grade an optimization task on correctness, runtime, quality, style.

    The final score blends test pass rate (50%), benchmark speedup (30%),
    AST-based quality heuristics (15%), and line-hygiene style (5%).
    A timeout in either the test suite or the benchmark zeroes the score.
    """
    # Correctness gate: run the full (visible + hidden) pytest suite first.
    execution = run_pytest_suite(candidate_code, [*task.visible_tests, *task.hidden_tests], timeout_s=task.benchmark_timeout_s)
    test_fraction = execution.passed / execution.total if execution.total else 0.0

    if execution.timed_out:
        return TaskGrade(score=0.0, tests_passed=execution.passed, tests_total=execution.total, timed_out=True, details={"tests": execution.output})

    runtime_score, timed_out, benchmark_output = benchmark_runtime(candidate_code, task)
    if timed_out:
        return TaskGrade(score=0.0, tests_passed=execution.passed, tests_total=execution.total, timed_out=True, details={"tests": execution.output, "benchmark": benchmark_output})

    quality_score = ast_quality_score(candidate_code, task)
    pep8_score = style_score(candidate_code, task.style_max_line_length)
    # Weighted blend; clamp_score keeps the result within [0, 1].
    score = clamp_score((0.5 * test_fraction) + (0.3 * runtime_score) + (0.15 * quality_score) + (0.05 * pep8_score))
    return TaskGrade(
        score=score,
        syntax_score=1.0,
        tests_passed=execution.passed,
        tests_total=execution.total,
        quality_score=quality_score,
        runtime_score=runtime_score,
        details={
            "tests": execution.output,
            "benchmark": benchmark_output,
            "test_fraction": round(test_fraction, 4),
            "runtime_score": round(runtime_score, 4),
            "style_score": round(pep8_score, 4),
        },
    )
135
+
pytest-cache-files-1f62ra1g/container_sim/server/graders/pytest_runner.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Helpers for deterministic pytest execution in temp sandboxes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import subprocess
7
+ import sys
8
+ import tempfile
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import Iterable
12
+
13
+
14
@dataclass(frozen=True)
class PytestExecution:
    """Immutable summary of one sandboxed pytest run."""

    passed: int  # tests whose call phase passed
    failed: int  # tests whose call phase failed
    total: int  # overall test count reported to the grader
    timed_out: bool  # True when the subprocess exceeded the timeout
    output: str  # combined stdout/stderr (or error text) from the run
21
+
22
+
23
+ def _test_module_source(tests: Iterable[str]) -> str:
24
+ blocks: list[str] = ["from candidate import * # noqa: F401,F403"]
25
+ for index, test in enumerate(tests, start=1):
26
+ snippet = str(test).strip()
27
+ if not snippet:
28
+ continue
29
+ if snippet.startswith("def test_"):
30
+ blocks.append(snippet)
31
+ continue
32
+ blocks.append(
33
+ "\n".join(
34
+ [
35
+ f"def test_case_{index:03d}():",
36
+ f" assert {snippet}",
37
+ ]
38
+ )
39
+ )
40
+ return "\n\n".join(blocks) or "def test_placeholder():\n assert True\n"
41
+
42
+
43
def _runner_script() -> str:
    """Return the source of the pytest driver executed inside the sandbox.

    The driver counts per-test pass/fail outcomes with a plugin hook and
    writes a JSON summary to ``pytest_results.json`` for the parent process
    to read after the subprocess exits.
    """
    return """import json
import pathlib
import pytest


class Collector:
    def __init__(self) -> None:
        self.passed = 0
        self.failed = 0

    def pytest_runtest_logreport(self, report):
        if report.when != "call":
            return
        if report.passed:
            self.passed += 1
        elif report.failed:
            self.failed += 1


collector = Collector()
exit_code = pytest.main(["-q", "test_candidate.py"], plugins=[collector])
payload = {
    "passed": collector.passed,
    "failed": collector.failed,
    "exit_code": int(exit_code),
}
pathlib.Path("pytest_results.json").write_text(json.dumps(payload), encoding="utf-8")
"""
72
+
73
+
74
def run_pytest_suite(candidate_code: str, tests: Iterable[str], timeout_s: float = 3.0) -> PytestExecution:
    """Run the given test snippets against *candidate_code* under pytest.

    The candidate module, the generated test module, and a JSON-reporting
    runner are written into a fresh temporary directory and executed in a
    subprocess. Every failure mode (timeout, missing results file, bad JSON,
    unexpected error) degrades to a fully-failed PytestExecution rather than
    raising.
    """
    test_cases = list(tests)
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-") as temp_dir:
            temp_path = Path(temp_dir)
            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (temp_path / "test_candidate.py").write_text(_test_module_source(test_cases), encoding="utf-8")
            (temp_path / "runner.py").write_text(_runner_script(), encoding="utf-8")

            try:
                completed = subprocess.run(
                    [sys.executable, "runner.py"],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=timeout_s,
                    check=False,
                )
            except subprocess.TimeoutExpired as exc:
                # Treat a hang as "all tests failed" with whatever output we got.
                output = (exc.stdout or "") + (exc.stderr or "")
                return PytestExecution(
                    passed=0,
                    failed=max(len(test_cases), 1),
                    total=max(len(test_cases), 1),
                    timed_out=True,
                    output=(output or "pytest timed out").strip(),
                )

            result_path = temp_path / "pytest_results.json"
            if not result_path.exists():
                # Runner crashed before writing results: report a full failure.
                output = (completed.stdout or "") + (completed.stderr or "")
                total = max(len(test_cases), 1)
                return PytestExecution(0, total, total, False, output.strip())

            try:
                payload = json.loads(result_path.read_text(encoding="utf-8"))
            except Exception as exc:
                output = ((completed.stdout or "") + (completed.stderr or "")).strip()
                return PytestExecution(0, max(len(test_cases), 1), max(len(test_cases), 1), False, (output or str(exc)).strip())

            passed = int(payload.get("passed", 0))
            failed = int(payload.get("failed", 0))
            # Guard against under-reporting (e.g. collection errors skipping tests).
            total = max(passed + failed, len(test_cases))
            output = ((completed.stdout or "") + (completed.stderr or "")).strip()
            return PytestExecution(passed, failed, total, False, output)
    except Exception as exc:
        return PytestExecution(0, max(len(test_cases), 1), max(len(test_cases), 1), False, str(exc))
121
+
pytest-cache-files-1f62ra1g/container_sim/server/graders/syntax.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Task graders for syntax and bug-fix tasks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .common import clamp_score, compiles, normalized_diff_score, style_score, syntax_error_message
6
+ from .optimization import grade_optimization_task
7
+ from .pytest_runner import run_pytest_suite
8
+ from ..models import TaskGrade
9
+ from ..tasks.task_bank import TaskSpec
10
+
11
+
12
def grade_syntax_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Grade a syntax-repair task: full credit once the code parses."""
    error = syntax_error_message(candidate_code)
    diff_score = normalized_diff_score(candidate_code, task.reference_code)
    style_base = style_score(candidate_code, task.style_max_line_length)
    if error:
        # Still broken: partial credit scales with similarity to the reference.
        partial = clamp_score(0.15 + (0.55 * diff_score))
        return TaskGrade(
            score=partial,
            syntax_score=0.0,
            quality_score=diff_score * style_base,
            details={"compile_error": error},
        )
    return TaskGrade(score=1.0, syntax_score=1.0, quality_score=style_base, details={"compile_error": ""})
20
+
21
+
22
def grade_bug_fix_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
    """Grade a bug-fix task by pytest pass rate, gated on compilability."""
    # A candidate that does not even compile earns nothing.
    if not compiles(candidate_code):
        error = syntax_error_message(candidate_code)
        return TaskGrade(score=0.0, syntax_score=0.0, details={"compile_error": error})

    tests = list(task.visible_tests)
    if include_hidden:
        tests.extend(task.hidden_tests)

    execution = run_pytest_suite(candidate_code, tests, timeout_s=3.0)
    if execution.timed_out:
        # Compiles but hangs: syntax credit only, zero overall score.
        return TaskGrade(
            score=0.0,
            syntax_score=1.0,
            tests_passed=execution.passed,
            tests_total=execution.total,
            timed_out=True,
            details={"compile_error": "", "tests": execution.output},
        )

    # Score is simply the fraction of the selected tests that pass.
    pass_fraction = execution.passed / execution.total if execution.total else 0.0
    quality = style_score(candidate_code, task.style_max_line_length)
    return TaskGrade(
        score=clamp_score(pass_fraction),
        syntax_score=1.0,
        tests_passed=execution.passed,
        tests_total=execution.total,
        quality_score=quality,
        details={"compile_error": "", "tests": execution.output},
    )
52
+
53
+
54
def grade_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
    """Dispatch grading to the grader matching the task's kind."""
    kind = task.task_kind
    if kind == "syntax_fix":
        return grade_syntax_task(candidate_code, task)
    if kind == "bug_fix":
        return grade_bug_fix_task(candidate_code, task, include_hidden=include_hidden)
    # Any other kind (currently only "optimization") uses the benchmark grader.
    return grade_optimization_task(candidate_code, task)
60
+
pytest-cache-files-1f62ra1g/container_sim/server/grading.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic grading helpers for PR-review tasks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from dataclasses import dataclass
7
+ from typing import Iterable, List, Optional, Sequence, Set
8
+
9
+ try:
10
+ from models import ReviewFinding, TaskGrade
11
+ from server.task_bank import RubricIssue, TaskSpec
12
+ except ModuleNotFoundError: # pragma: no cover
13
+ from ..models import ReviewFinding, TaskGrade
14
+ from .task_bank import RubricIssue, TaskSpec
15
+
16
+
17
# Deterministic penalties subtracted per spurious or repeated finding.
FALSE_POSITIVE_PENALTY = 0.10
DUPLICATE_PENALTY = 0.05
19
+
20
+
21
@dataclass(frozen=True)
class FindingMatch:
    """Result of matching one finding against the rubric."""

    issue_id: Optional[str]  # matched rubric issue id, or None for a miss
    duplicate: bool = False  # True when the finding repeats an earlier one
27
+
28
+
29
def finding_fingerprint(finding: ReviewFinding) -> str:
    """Build a deterministic fingerprint for duplicate detection."""
    parts = [
        finding.file_path,
        str(finding.line or 0),
        finding.category,
        finding.severity,
        finding.title,
        finding.explanation,
        finding.suggested_fix,
    ]
    # Sorted token set makes the fingerprint order- and case-insensitive.
    token_set = tokens(" ".join(parts))
    return "|".join(sorted(token_set))
44
+
45
+
46
def match_finding(
    finding: ReviewFinding,
    task: TaskSpec,
    matched_issue_ids: Set[str],
    seen_fingerprints: Set[str],
) -> FindingMatch:
    """Match one finding against the remaining rubric issues.

    Duplicates are detected first via fingerprint comparison; otherwise the
    first not-yet-credited rubric issue that fits the finding wins.
    """
    fingerprint = finding_fingerprint(finding)
    if fingerprint in seen_fingerprints:
        return FindingMatch(issue_id=None, duplicate=True)

    for issue in task.rubric_issues:
        # Each rubric issue can be credited at most once per episode.
        if issue.issue_id in matched_issue_ids:
            continue
        if finding_matches_issue(finding, issue):
            return FindingMatch(issue_id=issue.issue_id)
    # No rubric issue fits: the caller counts this as a false positive.
    return FindingMatch(issue_id=None)
64
+
65
+
66
def finding_matches_issue(finding: ReviewFinding, issue: RubricIssue) -> bool:
    """Return True when a finding deterministically matches a rubric issue."""
    # Structural fields must line up exactly; line numbers may be off by 2.
    if (
        finding.file_path != issue.file_path
        or finding.category != issue.category
        or finding.severity != issue.severity
    ):
        return False
    if finding.line is None or abs(finding.line - issue.line) > 2:
        return False

    # The free text must mention enough of the issue's expected keywords.
    combined_text = " ".join([finding.title, finding.explanation, finding.suggested_fix])
    finding_tokens = tokens(combined_text)
    hits = sum(1 for keyword in issue.keywords if keyword in finding_tokens)
    return hits >= issue.min_keyword_hits
83
+
84
+
85
def score_task(
    task: TaskSpec,
    matched_issue_ids: Iterable[str],
    false_positives: int = 0,
    duplicate_findings: int = 0,
) -> TaskGrade:
    """Score a task from cumulative episode state."""
    matched_set = set(matched_issue_ids)
    matched_weight = sum(
        issue.weight for issue in task.rubric_issues if issue.issue_id in matched_set
    )
    # Apply penalties sequentially, then clamp the rounded total into [0, 1].
    adjusted = matched_weight
    adjusted -= false_positives * FALSE_POSITIVE_PENALTY
    adjusted -= duplicate_findings * DUPLICATE_PENALTY
    final_score = max(0.0, min(1.0, round(adjusted, 6)))
    return TaskGrade(
        score=final_score,
        matched_issue_ids=sorted(matched_set),
        false_positives=false_positives,
        duplicate_findings=duplicate_findings,
        matched_weight=min(1.0, round(matched_weight, 6)),
    )
108
+
109
+
110
def grade_findings(task: TaskSpec, findings: Sequence[ReviewFinding]) -> TaskGrade:
    """Offline-grade a batch of findings for one task."""
    matched_issue_ids: Set[str] = set()
    seen_fingerprints: Set[str] = set()
    false_positives = 0
    duplicate_findings = 0

    for finding in findings:
        outcome = match_finding(
            finding=finding,
            task=task,
            matched_issue_ids=matched_issue_ids,
            seen_fingerprints=seen_fingerprints,
        )
        if outcome.duplicate:
            duplicate_findings += 1
            continue
        # First occurrence: remember the fingerprint before classifying it.
        seen_fingerprints.add(finding_fingerprint(finding))
        if outcome.issue_id is not None:
            matched_issue_ids.add(outcome.issue_id)
        else:
            false_positives += 1

    return score_task(
        task=task,
        matched_issue_ids=matched_issue_ids,
        false_positives=false_positives,
        duplicate_findings=duplicate_findings,
    )
141
+
142
+
143
def tokens(text: str) -> Set[str]:
    """Normalize free text into deterministic comparison tokens."""
    lowered = text.lower()
    return {match for match in re.findall(r"[a-z0-9_]+", lowered)}
147
+
pytest-cache-files-1f62ra1g/container_sim/server/models.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Typed models for the self-contained server package."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List, Literal, Optional
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+ from .compat import Action, Observation, State
10
+
11
+
12
+ Difficulty = Literal["easy", "medium", "hard"]
13
+ TaskKind = Literal["syntax_fix", "bug_fix", "optimization"]
14
+ ActionType = Literal["analyze_code", "edit_code", "run_tests", "submit_solution"]
15
+ Category = Literal["bug", "security", "performance", "maintainability", "style", "testing"]
16
+ Severity = Literal["critical", "warning", "info"]
17
+
18
+
19
class HistoryEntry(BaseModel):
    """One recorded action within an episode's history."""

    step: int = Field(..., ge=0)
    action_type: ActionType
    status: str
    reward: float
24
+
25
+
26
class RewardDetails(BaseModel):
    """Breakdown of the components contributing to one step's reward."""

    value: float  # final combined reward for the step
    syntax_reward: float = 0.0
    test_reward: float = 0.0
    quality_bonus: float = 0.0
    correctness_bonus: float = 0.0
    progress_delta: float = 0.0
    stagnation_penalty: float = 0.0
    regression_penalty: float = 0.0
    invalid_action_penalty: float = 0.0
    timeout_penalty: float = 0.0
    reason: str  # human-readable explanation of the reward
    prev_score: float = 0.0
    curr_score: float = 0.0
    code_changed: bool = False
41
+
42
+
43
class PythonCodeReviewAction(Action):
    """Agent action; ``code`` is only required for edit_code actions."""

    action_type: ActionType
    code: Optional[str] = None
46
+
47
+
48
class PythonCodeReviewObservation(Observation):
    """Full observation returned to the agent after each step."""

    task_id: str
    title: str = ""
    difficulty: Difficulty
    task_kind: Optional[TaskKind] = None
    task_description: str
    current_code: str
    errors: str  # most recent compile-error text, "" when none
    test_results: str  # human-readable summary of the last test run
    visible_tests: List[str] = Field(default_factory=list)
    history: List[HistoryEntry] = Field(default_factory=list)
    attempts_remaining: int = Field(..., ge=0)
    last_action_status: str = ""
    score: float = Field(..., ge=0.0, le=1.0)
    reward_details: RewardDetails = Field(
        default_factory=lambda: RewardDetails(value=0.0, reason="Reset")
    )
65
+
66
+
67
class PythonCodeReviewState(State):
    """Mutable per-episode server-side state."""

    episode_id: str
    step_count: int = Field(default=0, ge=0)
    task_id: Optional[str] = None
    difficulty: Optional[Difficulty] = None
    task_kind: Optional[TaskKind] = None
    attempts_remaining: int = Field(default=0, ge=0)
    current_code: str = ""  # the agent's working copy of the solution
    errors: str = ""
    test_results: str = ""
    history: List[HistoryEntry] = Field(default_factory=list)
    score: float = Field(default=0.0, ge=0.0, le=1.0)
    done: bool = False
80
+
81
+
82
class TaskDescriptor(BaseModel):
    """Public description of a task, safe to expose to clients."""

    task_id: str
    title: str
    difficulty: Difficulty
    task_kind: Optional[TaskKind] = None
    task_description: str = ""
    starter_code: str = ""
    visible_tests: List[str] = Field(default_factory=list)
    goal: str = ""
    repo_summary: str = ""
    changed_files: List[str] = Field(default_factory=list)
    available_files: List[str] = Field(default_factory=list)
    max_steps: int = Field(..., ge=1)
95
+
96
+
97
class TaskSummary(BaseModel):
    """Compact task listing entry (e.g. for index endpoints)."""

    task_id: str
    difficulty: Difficulty
    title: str
    goal: str = ""
102
+
103
+
104
class ReviewFinding(BaseModel):
    """One reviewer-reported issue on a change under review."""

    title: str
    file_path: str = ""
    line: Optional[int] = Field(default=None, ge=1)
    category: Category = "bug"
    severity: Severity = "warning"
    rationale: str = ""
    recommendation: str = ""
    rule_id: str = ""

    # Read-only aliases kept for compatibility with grading code that
    # refers to findings via explanation/suggested_fix.
    @property
    def explanation(self) -> str:
        return self.rationale

    @property
    def suggested_fix(self) -> str:
        return self.recommendation
121
+
122
+
123
class DirectReviewResponse(BaseModel):
    """Response payload for the direct (non-episodic) review endpoint."""

    issues: List[ReviewFinding] = Field(default_factory=list)
    summary: str = ""
    score: float = Field(default=0.0, ge=0.0, le=1.0)
    improved_code: Optional[str] = None
128
+
129
+
130
class TaskGrade(BaseModel):
    """Result of grading one candidate solution or finding batch."""

    score: float = Field(..., ge=0.0, le=1.0)
    syntax_score: float = Field(default=0.0, ge=0.0, le=1.0)
    tests_passed: int = Field(default=0, ge=0)
    tests_total: int = Field(default=0, ge=0)
    quality_score: float = Field(default=0.0, ge=0.0, le=1.0)
    runtime_score: float = Field(default=0.0, ge=0.0, le=1.0)
    timed_out: bool = False
    # Fields below are used by the rubric-based PR-review grader.
    matched_issue_ids: List[str] = Field(default_factory=list)
    false_positives: int = Field(default=0, ge=0)
    duplicate_findings: int = Field(default=0, ge=0)
    matched_weight: float = Field(default=0.0, ge=0.0, le=1.0)
    details: Dict[str, Any] = Field(default_factory=dict)
143
+
144
+
145
class HealthResponse(BaseModel):
    """Health-check payload reported by the server."""

    status: Literal["ok"] = "ok"
    environment: str = "python_code_review_env"
    task_count: int = Field(default=0, ge=0)
149
+
pytest-cache-files-1f62ra1g/container_sim/server/python_env_environment.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """Compatibility shim for older imports."""
2
+
3
+ try:
4
+ from server.code_review_environment import PythonEnvironment
5
+ except ModuleNotFoundError: # pragma: no cover
6
+ from .code_review_environment import PythonEnvironment
7
+
8
+
9
+ __all__ = ["PythonEnvironment"]
pytest-cache-files-1f62ra1g/container_sim/server/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ openenv-core[core]>=0.2.2
2
+ fastapi>=0.115.0
3
+ uvicorn[standard]>=0.30.0
4
+ openai>=1.40.0
5
+ pytest>=8.0.0
6
+ pydantic>=2.0.0
pytest-cache-files-1f62ra1g/container_sim/server/static_review.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic static-review helpers for arbitrary Python code.
2
+
3
+ Unlike the benchmark grader, this module does not compare against hidden rubric
4
+ items. Instead, it performs direct AST-based review on arbitrary snippets so it
5
+ can be used for manual testing, examples, and future dataset generation.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import ast
11
+ from typing import List, Optional
12
+
13
+ try:
14
+ from models import DirectReviewResponse, ReviewFinding
15
+ except ModuleNotFoundError: # pragma: no cover
16
+ from ..models import DirectReviewResponse, ReviewFinding
17
+
18
+
19
class _StaticAnalyzer(ast.NodeVisitor):
    """AST visitor that emits structured review findings.

    The visitor intentionally focuses on a small set of high-signal patterns so
    the direct-review endpoint stays predictable and easy to understand.
    """

    def __init__(self) -> None:
        # Findings are collected here and deduplicated by the caller.
        self.issues: List[ReviewFinding] = []

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:  # noqa: N802
        """Flag mutable default arguments in function definitions."""

        # Keyword-only parameters can carry mutable defaults too; their
        # ``kw_defaults`` slots are None for parameters without a default,
        # so filter those out before inspecting.
        candidates = list(node.args.defaults) + [
            default for default in node.args.kw_defaults if default is not None
        ]
        for default in candidates:
            if isinstance(default, (ast.List, ast.Dict, ast.Set)):
                self.issues.append(
                    ReviewFinding(
                        title="Mutable default argument",
                        line=getattr(default, "lineno", node.lineno),
                        category="bug",
                        severity="warning",
                        rationale=(
                            "Mutable defaults persist across calls and can leak state "
                            "between unrelated requests."
                        ),
                        recommendation="Use None as the default and create the object inside the function.",
                        rule_id="mutable-default-list",
                    )
                )
        self.generic_visit(node)

    # Async functions share the exact same default-argument pitfall; without
    # this alias, ``async def`` bodies would escape the check entirely.
    visit_AsyncFunctionDef = visit_FunctionDef

    def visit_Call(self, node: ast.Call) -> None:  # noqa: N802
        """Inspect function calls for obviously unsafe or noisy patterns."""

        func_name = self._call_name(node)
        if func_name in {"eval", "exec"}:
            self.issues.append(
                ReviewFinding(
                    title=f"Avoid {func_name} on untrusted input",
                    line=node.lineno,
                    category="security",
                    severity="critical",
                    rationale=(
                        f"{func_name} executes arbitrary code and is unsafe on "
                        "user-controlled input."
                    ),
                    recommendation="Use a safe parser or a whitelist-based evaluator.",
                    rule_id="avoid-eval" if func_name == "eval" else "avoid-exec",
                )
            )
        if func_name.endswith("check_output") or func_name.endswith("run"):
            for keyword in node.keywords:
                # `shell=True` is only a problem when the command comes from a
                # shell-parsed string, but this heuristic is high value for
                # review and intentionally conservative.
                if keyword.arg == "shell" and isinstance(keyword.value, ast.Constant) and keyword.value.value is True:
                    self.issues.append(
                        ReviewFinding(
                            title="shell=True with dynamic input",
                            line=node.lineno,
                            category="security",
                            severity="critical",
                            rationale=(
                                "shell=True executes through the shell and can allow "
                                "command injection when the command string is interpolated."
                            ),
                            recommendation="Pass a list of arguments and keep shell=False.",
                            rule_id="shell-true-command-injection",
                        )
                    )
        if func_name == "print":
            self.issues.append(
                ReviewFinding(
                    title="Print statement in application logic",
                    line=node.lineno,
                    category="style",
                    severity="info",
                    rationale="Production services should prefer structured logging over print statements.",
                    recommendation="Use the logging module or return the value to the caller.",
                    rule_id="print-statement",
                )
            )
        self.generic_visit(node)

    def visit_ExceptHandler(self, node: ast.ExceptHandler) -> None:  # noqa: N802
        """Flag bare exception handlers that hide failures."""

        # ``node.type`` is None exactly when the handler is a bare ``except:``.
        if node.type is None:
            self.issues.append(
                ReviewFinding(
                    title="Bare except",
                    line=node.lineno,
                    category="maintainability",
                    severity="warning",
                    rationale="Bare except catches KeyboardInterrupt and other system-level exceptions.",
                    recommendation="Catch a specific exception and record the failure.",
                    rule_id="bare-except",
                )
            )
        self.generic_visit(node)

    def visit_For(self, node: ast.For) -> None:  # noqa: N802
        """Look for list-membership checks nested in loops."""

        # Only the first suspicious comparison per loop is reported to keep
        # the signal readable; duplicates are also dropped by the caller.
        for child in ast.walk(node):
            if isinstance(child, ast.Compare) and any(
                isinstance(operator, (ast.In, ast.NotIn)) for operator in child.ops
            ):
                if isinstance(child.comparators[0], ast.Name):
                    self.issues.append(
                        ReviewFinding(
                            title="Potential quadratic membership check inside loop",
                            line=child.lineno,
                            category="performance",
                            severity="warning",
                            rationale=(
                                "Repeated membership checks against a list inside a loop "
                                "can degrade to quadratic runtime."
                            ),
                            recommendation="Use a set or dict for O(1) membership checks.",
                            rule_id="quadratic-membership-check",
                        )
                    )
                    break
        self.generic_visit(node)

    @staticmethod
    def _call_name(node: ast.Call) -> str:
        """Extract a dotted function name such as `subprocess.run`."""

        func = node.func
        if isinstance(func, ast.Name):
            return func.id
        if isinstance(func, ast.Attribute):
            prefix = _StaticAnalyzer._attribute_prefix(func.value)
            return f"{prefix}.{func.attr}" if prefix else func.attr
        # Calls on arbitrary expressions (lambdas, subscripts) have no name.
        return ""

    @staticmethod
    def _attribute_prefix(node: ast.AST) -> str:
        """Reconstruct the left-hand side of an attribute chain."""

        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            prefix = _StaticAnalyzer._attribute_prefix(node.value)
            return f"{prefix}.{node.attr}" if prefix else node.attr
        return ""
167
+
168
+
169
def analyze_python_code(code: str) -> List[ReviewFinding]:
    """Analyze arbitrary Python code and return structured findings."""

    # Guard: an all-whitespace submission yields a single explanatory finding.
    if not code.strip():
        empty_finding = ReviewFinding(
            title="No code provided",
            category="bug",
            severity="warning",
            rationale="The reviewer cannot inspect an empty submission.",
            recommendation="Provide Python source code.",
            rule_id="empty-input",
        )
        return [empty_finding]

    # Syntax errors are turned into findings rather than exceptions so API
    # consumers always get a valid response shape.
    try:
        parsed = ast.parse(code)
    except SyntaxError as exc:
        syntax_finding = ReviewFinding(
            title="Syntax error",
            line=exc.lineno,
            category="bug",
            severity="critical",
            rationale=exc.msg,
            recommendation="Fix the syntax error before running static review.",
            rule_id="syntax-error",
        )
        return [syntax_finding]

    visitor = _StaticAnalyzer()
    visitor.visit(parsed)
    return _deduplicate(visitor.issues)
204
+
205
+
206
def build_direct_review_response(
    code: str, context: Optional[str] = None
) -> DirectReviewResponse:
    """Build the public direct-review response for the `/review` route."""

    findings = analyze_python_code(code)

    # The direct-review score is intentionally simple: more severe issues
    # lower the score more aggressively. Unknown severities count as "info".
    penalty_by_severity = {"critical": 0.3, "warning": 0.15}
    total_penalty = sum(
        penalty_by_severity.get(finding.severity, 0.05) for finding in findings
    )

    clamped_score = min(1.0, max(0.0, 1.0 - total_penalty))
    return DirectReviewResponse(
        issues=findings,
        summary=_build_summary(findings, context),
        score=clamped_score,
        improved_code=_suggest_improved_code(code, findings),
    )
232
+
233
+
234
+ def _build_summary(issues: List[ReviewFinding], context: Optional[str]) -> str:
235
+ """Create a concise human-readable summary for the direct-review response."""
236
+
237
+ if not issues:
238
+ base = "No obvious issues were detected by the deterministic reviewer."
239
+ else:
240
+ critical = sum(1 for issue in issues if issue.severity == "critical")
241
+ warnings = sum(1 for issue in issues if issue.severity == "warning")
242
+ infos = sum(1 for issue in issues if issue.severity == "info")
243
+ base = (
244
+ f"Detected {len(issues)} issue(s): {critical} critical, "
245
+ f"{warnings} warning, {infos} info."
246
+ )
247
+ if context:
248
+ return f"{base} Context: {context}"
249
+ return base
250
+
251
+
252
+ def _suggest_improved_code(code: str, issues: List[ReviewFinding]) -> Optional[str]:
253
+ """Append high-level fix directions to the submitted code."""
254
+
255
+ if not issues:
256
+ return None
257
+ suggestions = [issue.recommendation for issue in issues if issue.recommendation]
258
+ comment = " | ".join(dict.fromkeys(suggestions))
259
+ return f"{code.rstrip()}\n\n# Suggested review directions: {comment}"
260
+
261
+
262
+ def _deduplicate(findings: List[ReviewFinding]) -> List[ReviewFinding]:
263
+ """Drop duplicate findings that refer to the same rule and line."""
264
+
265
+ seen = set()
266
+ unique: List[ReviewFinding] = []
267
+ for finding in findings:
268
+ key = (finding.rule_id, finding.line, finding.category)
269
+ if key in seen:
270
+ continue
271
+ seen.add(key)
272
+ unique.append(finding)
273
+ return unique
pytest-cache-files-1f62ra1g/container_sim/server/task_bank.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Static PR-review tasks and hidden grading rubrics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Dict, Iterable, List, Sequence
7
+
8
+ try:
9
+ from models import Category, Difficulty, Severity, TaskDescriptor, TaskSummary
10
+ except ModuleNotFoundError: # pragma: no cover
11
+ from ..models import Category, Difficulty, Severity, TaskDescriptor, TaskSummary
12
+
13
+
14
@dataclass(frozen=True)
class RubricIssue:
    """One hidden issue that can be matched by the deterministic grader."""

    # Stable identifier for this rubric item.
    issue_id: str
    # Repository-relative path of the file containing the issue.
    file_path: str
    # 1-based line number where the issue lives.
    line: int
    category: Category
    severity: Severity
    # Keywords the grader looks for in a submitted finding; at least
    # `min_keyword_hits` of them must appear for the finding to match.
    keywords: Sequence[str]
    min_keyword_hits: int
    # Relative contribution of this issue to the task's matched weight.
    weight: float
26
+
27
+
28
@dataclass(frozen=True)
class TaskSpec:
    """Complete task definition, including hidden rubric metadata."""

    task_id: str
    difficulty: Difficulty
    title: str
    goal: str
    repo_summary: str
    # Unified diff shown to the reviewer.
    visible_diff: str
    # Post-change file contents keyed by repository-relative path.
    file_contents: Dict[str, str]
    changed_files: Sequence[str]
    # Hidden grading rubric; intentionally absent from to_descriptor().
    rubric_issues: Sequence[RubricIssue]
    max_steps: int

    @property
    def available_files(self) -> List[str]:
        """Return the paths a reviewer may open, derived from file_contents."""
        return list(self.file_contents.keys())

    def to_descriptor(self) -> TaskDescriptor:
        """Build the public task descriptor (rubric issues are not included)."""
        return TaskDescriptor(
            task_id=self.task_id,
            difficulty=self.difficulty,
            title=self.title,
            goal=self.goal,
            repo_summary=self.repo_summary,
            changed_files=list(self.changed_files),
            available_files=self.available_files,
            max_steps=self.max_steps,
        )

    def to_summary(self) -> TaskSummary:
        """Build the lightweight summary used by listing routes."""
        return TaskSummary(
            task_id=self.task_id,
            difficulty=self.difficulty,
            title=self.title,
            goal=self.goal,
        )
66
+
67
+
68
+ TASKS: List[TaskSpec] = [
69
+ TaskSpec(
70
+ task_id="py-pr-review-easy",
71
+ difficulty="easy",
72
+ title="Retry Delay Regression",
73
+ goal=(
74
+ "Review the pull request and identify the real bug introduced in the retry "
75
+ "delay helper before it ships."
76
+ ),
77
+ repo_summary=(
78
+ "This service computes retry delays for background notification delivery. "
79
+ "The change is intended to relax validation for legacy callers."
80
+ ),
81
+ visible_diff="\n".join(
82
+ [
83
+ "diff --git a/src/notifications/retry.py b/src/notifications/retry.py",
84
+ "@@",
85
+ "- if base_delay <= 0:",
86
+ "+ if base_delay < 0:",
87
+ " return 0.0",
88
+ ]
89
+ ),
90
+ file_contents={
91
+ "src/notifications/retry.py": "\n".join(
92
+ [
93
+ "from __future__ import annotations",
94
+ "",
95
+ "def calculate_retry_delay(attempt: int, base_delay: float = 2.0) -> float:",
96
+ ' """Return the retry delay in seconds."""',
97
+ " if attempt < 0:",
98
+ ' raise ValueError(\"attempt must be >= 0\")',
99
+ " if base_delay < 0:",
100
+ " return 0.0",
101
+ " return attempt / base_delay",
102
+ ]
103
+ )
104
+ },
105
+ changed_files=("src/notifications/retry.py",),
106
+ rubric_issues=(
107
+ RubricIssue(
108
+ issue_id="zero-base-delay-divides",
109
+ file_path="src/notifications/retry.py",
110
+ line=7,
111
+ category="bug",
112
+ severity="warning",
113
+ keywords=("zero", "division", "base_delay"),
114
+ min_keyword_hits=2,
115
+ weight=1.0,
116
+ ),
117
+ ),
118
+ max_steps=4,
119
+ ),
120
+ TaskSpec(
121
+ task_id="py-pr-review-medium",
122
+ difficulty="medium",
123
+ title="Coupon Billing Rollout",
124
+ goal=(
125
+ "Review the billing change and identify both the production regression and "
126
+ "the missing coverage that would have caught it."
127
+ ),
128
+ repo_summary=(
129
+ "The billing service is adding coupon support for one-off invoices. The PR "
130
+ "touches both the service code and its unit tests."
131
+ ),
132
+ visible_diff="\n".join(
133
+ [
134
+ "diff --git a/app/billing/invoice_service.py b/app/billing/invoice_service.py",
135
+ "@@",
136
+ " def charge_invoice(order: dict, gateway: Gateway) -> str:",
137
+ "- return gateway.charge(order[\"customer_id\"], order[\"amount_cents\"])",
138
+ "+ total = order[\"amount_cents\"]",
139
+ "+ coupon = order.get(\"coupon_code\")",
140
+ "+ if coupon:",
141
+ "+ discount = gateway.lookup_discount(coupon)",
142
+ "+ total = max(total - discount, 0)",
143
+ "+ return gateway.charge(order[\"customer_id\"], order[\"amount_cents\"])",
144
+ "",
145
+ "diff --git a/tests/test_invoice_service.py b/tests/test_invoice_service.py",
146
+ "@@",
147
+ " class FakeGateway:",
148
+ "+ def lookup_discount(self, coupon: str) -> int:",
149
+ "+ return 250",
150
+ ]
151
+ ),
152
+ file_contents={
153
+ "app/billing/invoice_service.py": "\n".join(
154
+ [
155
+ "from gateway import Gateway",
156
+ "",
157
+ "def charge_invoice(order: dict, gateway: Gateway) -> str:",
158
+ ' total = order["amount_cents"]',
159
+ ' coupon = order.get("coupon_code")',
160
+ " if coupon:",
161
+ " discount = gateway.lookup_discount(coupon)",
162
+ " total = max(total - discount, 0)",
163
+ ' return gateway.charge(order["customer_id"], order["amount_cents"])',
164
+ ]
165
+ ),
166
+ "tests/test_invoice_service.py": "\n".join(
167
+ [
168
+ "from app.billing.invoice_service import charge_invoice",
169
+ "",
170
+ "class FakeGateway:",
171
+ " def lookup_discount(self, coupon: str) -> int:",
172
+ " return 250",
173
+ "",
174
+ " def charge(self, customer_id: str, amount_cents: int) -> str:",
175
+ " self.last_charge = (customer_id, amount_cents)",
176
+ ' return "charge_123"',
177
+ "",
178
+ "def test_charge_invoice_without_coupon():",
179
+ " gateway = FakeGateway()",
180
+ ' charge_invoice({"customer_id": "cus_1", "amount_cents": 1000}, gateway)',
181
+ ' assert gateway.last_charge == ("cus_1", 1000)',
182
+ ]
183
+ ),
184
+ },
185
+ changed_files=("app/billing/invoice_service.py", "tests/test_invoice_service.py"),
186
+ rubric_issues=(
187
+ RubricIssue(
188
+ issue_id="discount-total-unused",
189
+ file_path="app/billing/invoice_service.py",
190
+ line=8,
191
+ category="bug",
192
+ severity="warning",
193
+ keywords=("discount", "total", "charge", "amount"),
194
+ min_keyword_hits=2,
195
+ weight=0.6,
196
+ ),
197
+ RubricIssue(
198
+ issue_id="missing-coupon-test",
199
+ file_path="tests/test_invoice_service.py",
200
+ line=11,
201
+ category="testing",
202
+ severity="warning",
203
+ keywords=("missing", "test", "coupon", "discount"),
204
+ min_keyword_hits=2,
205
+ weight=0.4,
206
+ ),
207
+ ),
208
+ max_steps=5,
209
+ ),
210
+ TaskSpec(
211
+ task_id="py-pr-review-hard",
212
+ difficulty="hard",
213
+ title="Async Job Runner Deduplication",
214
+ goal=(
215
+ "Review the async job-runner PR and find the subtle concurrency issues "
216
+ "without inventing extra problems."
217
+ ),
218
+ repo_summary=(
219
+ "A shared webhook backfill service is deduplicating in-flight work with an "
220
+ "async task cache and writing the latest result for operators to inspect."
221
+ ),
222
+ visible_diff="\n".join(
223
+ [
224
+ "diff --git a/app/jobs/runner.py b/app/jobs/runner.py",
225
+ "@@",
226
+ " async def run_job(job_id: str, payload: dict, worker) -> str:",
227
+ " if job_id in ACTIVE_RUNS:",
228
+ " return await ACTIVE_RUNS[job_id]",
229
+ "+ lock = asyncio.Lock()",
230
+ "+ async with lock:",
231
+ "+ task = asyncio.create_task(worker.run(payload))",
232
+ "+ ACTIVE_RUNS[job_id] = task",
233
+ " try:",
234
+ " result = await task",
235
+ " finally:",
236
+ " ACTIVE_RUNS.pop(job_id, None)",
237
+ "+ Path(\"latest-result.json\").write_text(result)",
238
+ " return result",
239
+ ]
240
+ ),
241
+ file_contents={
242
+ "app/jobs/runner.py": "\n".join(
243
+ [
244
+ "import asyncio",
245
+ "from pathlib import Path",
246
+ "",
247
+ "ACTIVE_RUNS: dict[str, asyncio.Task[str]] = {}",
248
+ "",
249
+ "async def run_job(job_id: str, payload: dict, worker) -> str:",
250
+ " if job_id in ACTIVE_RUNS:",
251
+ " return await ACTIVE_RUNS[job_id]",
252
+ "",
253
+ " lock = asyncio.Lock()",
254
+ " async with lock:",
255
+ " task = asyncio.create_task(worker.run(payload))",
256
+ " ACTIVE_RUNS[job_id] = task",
257
+ " try:",
258
+ " result = await task",
259
+ " finally:",
260
+ " ACTIVE_RUNS.pop(job_id, None)",
261
+ "",
262
+ ' Path("latest-result.json").write_text(result)',
263
+ " return result",
264
+ ]
265
+ ),
266
+ "tests/test_runner.py": "\n".join(
267
+ [
268
+ "import pytest",
269
+ "",
270
+ "from app.jobs.runner import run_job",
271
+ "",
272
+ "class FakeWorker:",
273
+ " async def run(self, payload: dict) -> str:",
274
+ ' return payload["job_id"]',
275
+ "",
276
+ "@pytest.mark.asyncio",
277
+ "async def test_run_job_returns_worker_result():",
278
+ " worker = FakeWorker()",
279
+ ' result = await run_job("job-1", {"job_id": "job-1"}, worker)',
280
+ ' assert result == "job-1"',
281
+ ]
282
+ ),
283
+ },
284
+ changed_files=("app/jobs/runner.py", "tests/test_runner.py"),
285
+ rubric_issues=(
286
+ RubricIssue(
287
+ issue_id="per-call-lock-race",
288
+ file_path="app/jobs/runner.py",
289
+ line=9,
290
+ category="bug",
291
+ severity="warning",
292
+ keywords=("lock", "race", "concurrent", "duplicate"),
293
+ min_keyword_hits=2,
294
+ weight=0.55,
295
+ ),
296
+ RubricIssue(
297
+ issue_id="shared-output-file-race",
298
+ file_path="app/jobs/runner.py",
299
+ line=18,
300
+ category="maintainability",
301
+ severity="warning",
302
+ keywords=("latest", "result", "file", "concurrent", "overwrite"),
303
+ min_keyword_hits=2,
304
+ weight=0.45,
305
+ ),
306
+ ),
307
+ max_steps=6,
308
+ ),
309
+ ]
310
+
311
+
312
+ TASKS_BY_ID: Dict[str, TaskSpec] = {task.task_id: task for task in TASKS}
313
+
314
+
315
def list_task_descriptors() -> List[TaskDescriptor]:
    """Return public descriptors for all tasks."""

    descriptors: List[TaskDescriptor] = []
    for spec in TASKS:
        descriptors.append(spec.to_descriptor())
    return descriptors
319
+
320
+
321
def list_task_summaries() -> List[TaskSummary]:
    """Return task summaries for lightweight route responses."""

    return list(TaskSpec.to_summary(spec) for spec in TASKS)
325
+
326
+
327
def get_task(task_id: str) -> TaskSpec:
    """Return a task by id."""

    if task_id in TASKS_BY_ID:
        return TASKS_BY_ID[task_id]
    raise ValueError(f"Unknown task_id: {task_id}")  # pragma: no cover
334
+
335
+
336
def task_ids() -> Iterable[str]:
    """Return task ids in benchmark order."""

    ids: List[str] = []
    for spec in TASKS:
        ids.append(spec.task_id)
    return ids
340
+
pytest-cache-files-1f62ra1g/container_sim/server/tasks/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Self-contained task definitions for container builds."""
2
+
3
+ from .task_bank import TaskSpec, get_task, list_task_descriptors, list_task_summaries, task_ids
4
+
5
+ __all__ = [
6
+ "TaskSpec",
7
+ "get_task",
8
+ "list_task_descriptors",
9
+ "list_task_summaries",
10
+ "task_ids",
11
+ ]
12
+
pytest-cache-files-1f62ra1g/container_sim/server/tasks/task_bank.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic task bank for self-contained server builds."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Dict, List, Optional
7
+
8
+ from ..models import Difficulty, TaskDescriptor, TaskKind
9
+
10
+
11
@dataclass(frozen=True)
class TaskSpec:
    """One coding task with starter code, reference solution, and tests."""

    task_id: str
    title: str
    difficulty: Difficulty
    task_kind: TaskKind
    task_description: str
    # Buggy/incomplete code handed to the agent.
    starter_code: str
    # Known-good solution used for grading comparisons; never exposed
    # through to_descriptor().
    reference_code: str
    # Assertions shown to the agent vs. held-out assertions used in grading.
    visible_tests: List[str]
    hidden_tests: List[str]
    max_steps: int = 10
    # Optional benchmarking hooks for optimization-kind tasks.
    benchmark_entrypoint: Optional[str] = None
    benchmark_builder: Optional[str] = None
    benchmark_repeats: int = 1
    benchmark_timeout_s: float = 2.0
    style_max_line_length: int = 88
    # Substrings the grader looks for as code-quality markers.
    expected_quality_markers: List[str] = field(default_factory=list)

    def to_descriptor(self) -> TaskDescriptor:
        """Build the public descriptor (reference code and hidden tests omitted)."""
        return TaskDescriptor(
            task_id=self.task_id,
            title=self.title,
            difficulty=self.difficulty,
            task_kind=self.task_kind,
            task_description=self.task_description,
            starter_code=self.starter_code,
            visible_tests=list(self.visible_tests),
            max_steps=self.max_steps,
        )
41
+
42
+
43
+ TASK_SYNTAX_FIX = TaskSpec(
44
+ task_id="syntax-fix-easy",
45
+ title="Fix a syntax-broken username normalizer",
46
+ difficulty="easy",
47
+ task_kind="syntax_fix",
48
+ task_description=(
49
+ "You are reviewing a utility function before merge. The submitted patch left "
50
+ "the function with syntax errors. Repair the code so it compiles and preserves "
51
+ "the intended behavior of trimming, lowercasing, and replacing spaces with underscores."
52
+ ),
53
+ starter_code='''def normalize_username(raw_name: str) -> str:
54
+ cleaned = raw_name.strip().lower(
55
+ if not cleaned:
56
+ return "anonymous"
57
+ return cleaned.replace(" ", "_")
58
+ ''',
59
+ reference_code='''def normalize_username(raw_name: str) -> str:
60
+ cleaned = raw_name.strip().lower()
61
+ if not cleaned:
62
+ return "anonymous"
63
+ return cleaned.replace(" ", "_")
64
+ ''',
65
+ visible_tests=[
66
+ "normalize_username(' Alice Smith ') == 'alice_smith'",
67
+ "normalize_username(' ') == 'anonymous'",
68
+ "normalize_username('Bob') == 'bob'",
69
+ ],
70
+ hidden_tests=[
71
+ "normalize_username(' HELLO WORLD ') == 'hello_world'",
72
+ "normalize_username('') == 'anonymous'",
73
+ ],
74
+ max_steps=8,
75
+ )
76
+
77
+
78
+ TASK_BUG_FIX = TaskSpec(
79
+ task_id="bug-fix-medium",
80
+ title="Repair invoice discount calculation logic",
81
+ difficulty="medium",
82
+ task_kind="bug_fix",
83
+ task_description=(
84
+ "A billing helper function is returning the wrong amount after applying discounts. "
85
+ "The function signature is correct, but the calculation logic is broken. "
86
+ "Inspect the implementation, run visible tests, and fix the bug so all tests pass. "
87
+ "Do not change the function signature or validation logic."
88
+ ),
89
+ starter_code='''from typing import Iterable
90
+
91
+
92
+ def calculate_invoice_total(line_items: Iterable[int], discount_percent: int) -> int:
93
+ """Calculate invoice total with discount applied."""
94
+ if discount_percent < 0 or discount_percent > 100:
95
+ raise ValueError("discount_percent must be between 0 and 100")
96
+
97
+ subtotal = sum(line_items)
98
+ discounted_total = subtotal - (subtotal * discount_percent // 100)
99
+ return subtotal
100
+ ''',
101
+ reference_code='''from typing import Iterable
102
+
103
+
104
+ def calculate_invoice_total(line_items: Iterable[int], discount_percent: int) -> int:
105
+ """Calculate invoice total with discount applied."""
106
+ if discount_percent < 0 or discount_percent > 100:
107
+ raise ValueError("discount_percent must be between 0 and 100")
108
+
109
+ subtotal = sum(line_items)
110
+ discounted_total = subtotal - (subtotal * discount_percent // 100)
111
+ return discounted_total
112
+ ''',
113
+ visible_tests=[
114
+ "calculate_invoice_total([1000, 2000], 0) == 3000",
115
+ "calculate_invoice_total([1000, 2000], 50) == 1500",
116
+ "calculate_invoice_total([1000], 10) == 900",
117
+ "calculate_invoice_total([], 0) == 0",
118
+ ],
119
+ hidden_tests=[
120
+ "calculate_invoice_total([100, 200, 300], 25) == 450",
121
+ "calculate_invoice_total([5000], 99) == 50",
122
+ ],
123
+ max_steps=10,
124
+ )
125
+
126
+
127
+ TASK_OPTIMIZATION = TaskSpec(
128
+ task_id="optimization-hard",
129
+ title="Optimize inefficient user activity summarization",
130
+ difficulty="hard",
131
+ task_kind="optimization",
132
+ task_description=(
133
+ "Code review found that `summarize_user_activity` is inefficient for large event streams. "
134
+ "The current implementation repeatedly scans the full event list for every user, making it O(n**2). "
135
+ "Refactor it to aggregate counts in one pass while preserving the sorted output contract. "
136
+ "Style and code quality also matter: use idiomatic Python, proper types, and clear logic. "
137
+ "All tests must pass, and the optimized version should be measurably faster."
138
+ ),
139
+ starter_code='''from typing import Iterable
140
+
141
+
142
+ def summarize_user_activity(events: Iterable[dict]) -> list[tuple[str, int]]:
143
+ """Aggregate user activity counts."""
144
+
145
+ ordered_users = []
146
+ for event in events:
147
+ user_id = event["user_id"]
148
+ if user_id not in ordered_users:
149
+ ordered_users.append(user_id)
150
+
151
+ summary = []
152
+ for user_id in ordered_users:
153
+ count = 0
154
+ for event in events:
155
+ if event["user_id"] == user_id:
156
+ count += 1
157
+ summary.append((user_id, count))
158
+ return sorted(summary, key=lambda item: (-item[1], item[0]))
159
+ ''',
160
+ reference_code='''from collections import Counter
161
+ from typing import Iterable
162
+
163
+
164
+ def summarize_user_activity(events: Iterable[dict]) -> list[tuple[str, int]]:
165
+ """Aggregate user activity counts in one pass."""
166
+
167
+ counts = Counter(event["user_id"] for event in events)
168
+ return sorted(counts.items(), key=lambda item: (-item[1], item[0]))
169
+ ''',
170
+ visible_tests=[
171
+ "summarize_user_activity([{'user_id': 'alice'}, {'user_id': 'bob'}, {'user_id': 'alice'}]) == [('alice', 2), ('bob', 1)]",
172
+ "summarize_user_activity([{'user_id': 'z'}, {'user_id': 'a'}]) == [('a', 1), ('z', 1)]",
173
+ "summarize_user_activity([]) == []",
174
+ "summarize_user_activity([{'user_id': 'solo'}]) == [('solo', 1)]",
175
+ ],
176
+ hidden_tests=[
177
+ "summarize_user_activity([{'user_id': 'u2'}, {'user_id': 'u1'}, {'user_id': 'u2'}, {'user_id': 'u2'}, {'user_id': 'u1'}]) == [('u2', 3), ('u1', 2)]",
178
+ ],
179
+ max_steps=10,
180
+ benchmark_entrypoint="summarize_user_activity",
181
+ benchmark_builder='''def build_benchmark_events():
182
+ return [{"user_id": f"user_{index % 400}"} for index in range(6000)]''',
183
+ benchmark_repeats=3,
184
+ benchmark_timeout_s=1.0,
185
+ style_max_line_length=88,
186
+ expected_quality_markers=["Counter", "sorted"],
187
+ )
188
+
189
+
190
+ TASKS: Dict[str, TaskSpec] = {
191
+ "syntax-fix-easy": TASK_SYNTAX_FIX,
192
+ "bug-fix-medium": TASK_BUG_FIX,
193
+ "optimization-hard": TASK_OPTIMIZATION,
194
+ }
195
+
196
+
197
def task_ids() -> List[str]:
    """Return task ids in registration order.

    Derived from the TASKS registry (dicts preserve insertion order) so the
    list cannot drift when tasks are added or removed.
    """
    return list(TASKS)
199
+
200
+
201
def get_task(task_id: str) -> TaskSpec:
    """Look up a task by id, raising ValueError for unknown ids."""

    try:
        return TASKS[task_id]
    except KeyError:
        raise ValueError(
            f"Task {task_id} not found. Available: {list(TASKS.keys())}"
        ) from None
205
+
206
+
207
def list_task_descriptors() -> List[TaskDescriptor]:
    """Return public descriptors for every registered task."""

    descriptors: List[TaskDescriptor] = []
    for identifier in task_ids():
        descriptors.append(get_task(identifier).to_descriptor())
    return descriptors
209
+
210
+
211
def list_task_summaries() -> List[TaskDescriptor]:
    """Return task summaries; this build reuses full descriptors as summaries."""
    return list_task_descriptors()
213
+
server/app.py CHANGED
@@ -7,17 +7,27 @@ import os
7
  from fastapi import APIRouter, HTTPException
8
  from fastapi.responses import RedirectResponse
9
 
10
- from compat import create_app
11
-
12
- from models import (
13
- HealthResponse,
14
- PythonCodeReviewAction,
15
- PythonCodeReviewObservation,
16
- PythonCodeReviewState,
17
- TaskDescriptor,
18
- TaskGrade,
19
- )
20
- from server.env import PythonCodeReviewEnvironment
 
 
 
 
 
 
 
 
 
 
21
 
22
 
23
  try:
 
7
  from fastapi import APIRouter, HTTPException
8
  from fastapi.responses import RedirectResponse
9
 
10
+ try:
11
+ from compat import create_app
12
+ from models import (
13
+ HealthResponse,
14
+ PythonCodeReviewAction,
15
+ PythonCodeReviewObservation,
16
+ PythonCodeReviewState,
17
+ TaskDescriptor,
18
+ TaskGrade,
19
+ )
20
+ except Exception:
21
+ from .compat import create_app
22
+ from .models import (
23
+ HealthResponse,
24
+ PythonCodeReviewAction,
25
+ PythonCodeReviewObservation,
26
+ PythonCodeReviewState,
27
+ TaskDescriptor,
28
+ TaskGrade,
29
+ )
30
+ from server.env import PythonCodeReviewEnvironment
31
 
32
 
33
  try:
server/compat.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Compatibility helpers for OpenEnv and FastMCP runtime drift."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ import types
7
+ from typing import Any
8
+
9
+
10
def install_openenv_fastmcp_compat() -> None:
    """Patch FastMCP API differences so older OpenEnv builds keep importing.

    Best-effort monkey-patching: every patch stage is wrapped in a broad
    try/except so a partially incompatible FastMCP install never prevents
    the server from starting. If fastmcp is absent, this is a no-op.
    """
    try:
        import fastmcp  # type: ignore
    except Exception:
        # No fastmcp at all -> nothing to patch.
        return

    # Stage 1: provide fastmcp.Client if this FastMCP version lacks it.
    try:
        if not hasattr(fastmcp, "Client"):
            class CompatClient:
                """Minimal async MCP client used for legacy OpenEnv imports."""

                def __init__(self, *args: Any, **kwargs: Any) -> None:
                    self.args = args
                    self.kwargs = kwargs

                async def __aenter__(self) -> "CompatClient":
                    return self

                async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool:
                    # Never suppress exceptions from the managed block.
                    return False

                async def list_tools(self) -> list[Any]:
                    return []

                async def call_tool(self, tool_name: str, arguments: dict[str, Any]) -> Any:
                    # Import-compat only; actual tool calls are unsupported.
                    raise RuntimeError(
                        f"MCP client compatibility mode cannot call tool: {tool_name}"
                    )

            fastmcp.Client = CompatClient  # type: ignore[attr-defined]
    except Exception:
        pass

    # Stage 2: ensure fastmcp.client.client.CallToolResult resolves, creating
    # stub modules in sys.modules when the real submodules are missing.
    try:
        client_pkg = sys.modules.get("fastmcp.client")
        if client_pkg is None:
            client_pkg = types.ModuleType("fastmcp.client")
            sys.modules["fastmcp.client"] = client_pkg

        client_mod = sys.modules.get("fastmcp.client.client")
        if client_mod is None:
            client_mod = types.ModuleType("fastmcp.client.client")
            sys.modules["fastmcp.client.client"] = client_mod

        if not hasattr(client_mod, "CallToolResult"):
            class CallToolResult:
                """Compatibility container for legacy OpenEnv response handling."""

                def __init__(
                    self,
                    content: Any = None,
                    structured_content: Any = None,
                    meta: Any = None,
                    data: Any = None,
                    is_error: bool = False,
                ) -> None:
                    self.content = content
                    self.structured_content = structured_content
                    self.meta = meta
                    self.data = data
                    self.is_error = is_error

            client_mod.CallToolResult = CallToolResult

        # Link the stub submodule onto the package so attribute access works.
        client_pkg.client = client_mod  # type: ignore[attr-defined]
    except Exception:
        pass
78
+
79
+
80
+ install_openenv_fastmcp_compat()
81
+
82
+
83
+ from openenv.core.env_server.http_server import create_app as openenv_create_app
84
+ from openenv.core.env_server.interfaces import Environment
85
+ from openenv.core.env_server.types import Action, Observation, State
86
+
87
+
88
+ create_app = openenv_create_app
89
+
server/env_safe.py CHANGED
@@ -5,18 +5,32 @@ from __future__ import annotations
5
  from typing import Any, Optional
6
  from uuid import uuid4
7
 
8
- from compat import Environment
9
- from graders import grade_task
10
- from models import (
11
- HealthResponse,
12
- HistoryEntry,
13
- PythonCodeReviewAction,
14
- PythonCodeReviewObservation,
15
- PythonCodeReviewState,
16
- RewardDetails,
17
- TaskGrade,
18
- )
19
- from tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
 
22
  INVALID_ACTION_PENALTY = 0.10
@@ -489,4 +503,3 @@ class PythonCodeReviewEnvironment(
489
 
490
  PythonEnvironment = PythonCodeReviewEnvironment
491
  CodeReviewEnvironment = PythonCodeReviewEnvironment
492
-
 
5
  from typing import Any, Optional
6
  from uuid import uuid4
7
 
8
+ try:
9
+ from compat import Environment
10
+ from graders import grade_task
11
+ from models import (
12
+ HealthResponse,
13
+ HistoryEntry,
14
+ PythonCodeReviewAction,
15
+ PythonCodeReviewObservation,
16
+ PythonCodeReviewState,
17
+ RewardDetails,
18
+ TaskGrade,
19
+ )
20
+ from tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
21
+ except Exception:
22
+ from .compat import Environment
23
+ from .graders import grade_task
24
+ from .models import (
25
+ HealthResponse,
26
+ HistoryEntry,
27
+ PythonCodeReviewAction,
28
+ PythonCodeReviewObservation,
29
+ PythonCodeReviewState,
30
+ RewardDetails,
31
+ TaskGrade,
32
+ )
33
+ from .tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
34
 
35
 
36
  INVALID_ACTION_PENALTY = 0.10
 
503
 
504
  PythonEnvironment = PythonCodeReviewEnvironment
505
  CodeReviewEnvironment = PythonCodeReviewEnvironment
 
server/graders/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic graders for self-contained server builds."""
2
+
3
+ from .common import clamp_score
4
+ from .optimization import grade_optimization_task
5
+ from .pytest_runner import PytestExecution, run_pytest_suite
6
+ from .syntax import grade_bug_fix_task, grade_syntax_task, grade_task
7
+
8
+ __all__ = [
9
+ "PytestExecution",
10
+ "clamp_score",
11
+ "grade_bug_fix_task",
12
+ "grade_optimization_task",
13
+ "grade_syntax_task",
14
+ "grade_task",
15
+ "run_pytest_suite",
16
+ ]
17
+
server/graders/common.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared deterministic scoring helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ast
6
+ import difflib
7
+ import traceback
8
+ from typing import Tuple
9
+
10
+
11
def clamp_score(value: float) -> float:
    """Round *value* to six decimal places, then clamp it into [0.0, 1.0]."""
    bounded = min(round(value, 6), 1.0)
    return max(bounded, 0.0)
13
+
14
+
15
def syntax_error_message(code: str) -> str:
    """Describe why *code* fails to parse; empty string when it parses cleanly."""
    try:
        ast.parse(code)
        return ""
    except SyntaxError as err:
        return f"{err.msg} (line {err.lineno}, column {err.offset})"
    except Exception:
        # Non-SyntaxError parser failures fall back to a one-frame traceback.
        return traceback.format_exc(limit=1).strip()
23
+
24
+
25
def compiles(code: str) -> bool:
    """Return True when *code* byte-compiles as a module, False on any failure."""
    try:
        compile(code, "<candidate>", "exec")
        return True
    except Exception:
        # compile() can raise more than SyntaxError (e.g. ValueError on null bytes).
        return False
31
+
32
+
33
def normalized_diff_score(code: str, reference_code: str) -> float:
    """Similarity of *code* to *reference_code* in [0, 1], ignoring all whitespace."""
    stripped_candidate = "".join(code.split())
    stripped_reference = "".join(reference_code.split())
    matcher = difflib.SequenceMatcher(a=stripped_candidate, b=stripped_reference)
    # Inlined clamp: round to six decimals and bound to [0, 1].
    return max(0.0, min(1.0, round(matcher.ratio(), 6)))
39
+
40
+
41
def style_score(code: str, max_line_length: int = 88) -> float:
    """Heuristic style score in [0, 1]: line length 60%, no tabs 20%, no trailing whitespace 20%."""
    lines = code.splitlines() or [""]
    within_limit = sum(1 for line in lines if len(line) <= max_line_length) / len(lines)
    has_tabs = any("\t" in line for line in lines)
    has_trailing_ws = any(line != line.rstrip() for line in lines)
    raw = (within_limit * 0.6) + (0.0 if has_tabs else 0.2) + (0.0 if has_trailing_ws else 0.2)
    # Inlined clamp: round to six decimals and bound to [0, 1].
    return max(0.0, min(1.0, round(raw, 6)))
47
+
48
+
49
def nested_loop_depth(tree: ast.AST) -> int:
    """Return the maximum nesting depth of for/async-for/while loops in *tree*."""
    deepest = 0
    # Iterative DFS; visit order does not affect the maximum.
    pending: list[tuple[ast.AST, int]] = [(tree, 0)]
    while pending:
        node, depth = pending.pop()
        if isinstance(node, (ast.For, ast.AsyncFor, ast.While)):
            depth += 1
            deepest = max(deepest, depth)
        pending.extend((child, depth) for child in ast.iter_child_nodes(node))
    return deepest
62
+
63
+
64
+ def compile_tree(code: str) -> Tuple[ast.AST | None, str]:
65
+ try:
66
+ return ast.parse(code), ""
67
+ except SyntaxError as exc:
68
+ return None, f"{exc.msg} (line {exc.lineno}, column {exc.offset})"
69
+
server/graders/optimization.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic grading for optimization tasks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import subprocess
7
+ import sys
8
+ import tempfile
9
+ from pathlib import Path
10
+
11
+ from .common import clamp_score, compile_tree, nested_loop_depth, style_score
12
+ from .pytest_runner import run_pytest_suite
13
+ from ..models import TaskGrade
14
+ from ..tasks.task_bank import TaskSpec
15
+
16
+
17
def _benchmark_script(task: TaskSpec) -> str:
    """Render the timing script for *task*.

    The generated module imports the entrypoint from ``candidate``;
    benchmark_runtime substitutes ``starter`` to time the baseline. The
    script writes ``benchmark.json`` with the elapsed wall time and the row
    count of the final result.
    """
    # NOTE: doubled braces below are literal braces in the generated source.
    return f"""import json
import time
from candidate import {task.benchmark_entrypoint}

{task.benchmark_builder}

events = build_benchmark_events()
start = time.perf_counter()
for _ in range({task.benchmark_repeats}):
    result = {task.benchmark_entrypoint}(events)
elapsed = time.perf_counter() - start
Path = __import__("pathlib").Path
Path("benchmark.json").write_text(json.dumps({{"elapsed": elapsed, "rows": len(result)}}), encoding="utf-8")
"""
32
+
33
+
34
def benchmark_runtime(candidate_code: str, task: TaskSpec) -> tuple[float, bool, str]:
    """Time the candidate against the task's starter implementation.

    Returns ``(runtime_score, timed_out, output)``. The score maps speedup
    onto [0, 1] via ``(speedup - 1) / 3`` capped at 1.0, so a 4x speedup (or
    better) earns the full runtime score. Any unexpected failure degrades to
    ``(0.0, False, <error text>)`` rather than raising.
    """
    assert task.benchmark_entrypoint is not None
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-bench-") as temp_dir:
            temp_path = Path(temp_dir)
            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (temp_path / "starter.py").write_text(task.starter_code, encoding="utf-8")
            (temp_path / "candidate_runner.py").write_text(_benchmark_script(task), encoding="utf-8")
            # The starter runner is the same script retargeted at starter.py.
            starter_script = _benchmark_script(task).replace("from candidate import", "from starter import")
            (temp_path / "starter_runner.py").write_text(starter_script, encoding="utf-8")

            try:
                starter_run = subprocess.run(
                    [sys.executable, "starter_runner.py"],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=task.benchmark_timeout_s,
                    check=False,
                )
                # Both runners write the same benchmark.json; the starter
                # payload must be read before the candidate run overwrites it.
                starter_payload = json.loads((temp_path / "benchmark.json").read_text(encoding="utf-8"))
                candidate_run = subprocess.run(
                    [sys.executable, "candidate_runner.py"],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=task.benchmark_timeout_s,
                    check=False,
                )
                candidate_payload = json.loads((temp_path / "benchmark.json").read_text(encoding="utf-8"))
            except subprocess.TimeoutExpired as exc:
                # Only a timeout sets the timed_out flag for callers.
                output = (exc.stdout or "") + (exc.stderr or "")
                return 0.0, True, (output or "benchmark timed out").strip()
            except Exception as exc:
                return 0.0, False, str(exc)

            # Guard against zero/invalid elapsed values before dividing.
            starter_elapsed = max(float(starter_payload["elapsed"]), 1e-9)
            candidate_elapsed = max(float(candidate_payload["elapsed"]), 1e-9)
            speedup = starter_elapsed / candidate_elapsed
            runtime_score = clamp_score(min((speedup - 1.0) / 3.0, 1.0))
            output = "\n".join(
                part
                for part in [
                    starter_run.stdout.strip(),
                    starter_run.stderr.strip(),
                    candidate_run.stdout.strip(),
                    candidate_run.stderr.strip(),
                    f"starter={starter_elapsed:.6f}s candidate={candidate_elapsed:.6f}s speedup={speedup:.2f}x",
                ]
                if part
            )
            return runtime_score, False, output
    except Exception as exc:
        # e.g. temp-dir creation failure; report instead of raising.
        return 0.0, False, str(exc)
88
+
89
+
90
def ast_quality_score(code: str, task: TaskSpec) -> float:
    """Score code quality in [0, 1]: docstring (0.2), loop nesting <= 1 (0.4),
    plus 0.2 per expected quality marker found in the source text."""
    parsed, _ = compile_tree(code)
    if parsed is None:
        return 0.0
    import ast

    first_function = None
    for node in parsed.body:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            first_function = node
            break
    docstring_points = 0.2 if first_function and ast.get_docstring(first_function, clean=False) else 0.0
    nesting_points = 0.4 if nested_loop_depth(parsed) <= 1 else 0.0
    marker_points = sum(0.2 for marker in task.expected_quality_markers if marker in code)
    return clamp_score(docstring_points + nesting_points + marker_points)
104
+
105
+
106
def grade_optimization_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Grade an optimization submission.

    Weighted blend: tests 50%, benchmark speedup 30%, AST quality 15%,
    style 5%. A timeout in either the test run or the benchmark zeroes the
    score and sets ``timed_out``.
    """
    # Optimization tasks always run visible AND hidden tests together.
    execution = run_pytest_suite(candidate_code, [*task.visible_tests, *task.hidden_tests], timeout_s=task.benchmark_timeout_s)
    test_fraction = execution.passed / execution.total if execution.total else 0.0

    if execution.timed_out:
        return TaskGrade(score=0.0, tests_passed=execution.passed, tests_total=execution.total, timed_out=True, details={"tests": execution.output})

    runtime_score, timed_out, benchmark_output = benchmark_runtime(candidate_code, task)
    if timed_out:
        return TaskGrade(score=0.0, tests_passed=execution.passed, tests_total=execution.total, timed_out=True, details={"tests": execution.output, "benchmark": benchmark_output})

    quality_score = ast_quality_score(candidate_code, task)
    pep8_score = style_score(candidate_code, task.style_max_line_length)
    # 50/30/15/5 weighting, clamped into [0, 1].
    score = clamp_score((0.5 * test_fraction) + (0.3 * runtime_score) + (0.15 * quality_score) + (0.05 * pep8_score))
    # syntax_score=1.0: reaching this point implies the code compiled and ran.
    return TaskGrade(
        score=score,
        syntax_score=1.0,
        tests_passed=execution.passed,
        tests_total=execution.total,
        quality_score=quality_score,
        runtime_score=runtime_score,
        details={
            "tests": execution.output,
            "benchmark": benchmark_output,
            "test_fraction": round(test_fraction, 4),
            "runtime_score": round(runtime_score, 4),
            "style_score": round(pep8_score, 4),
        },
    )
135
+
server/graders/pytest_runner.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Helpers for deterministic pytest execution in temp sandboxes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import subprocess
7
+ import sys
8
+ import tempfile
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import Iterable
12
+
13
+
14
@dataclass(frozen=True)
class PytestExecution:
    """Outcome of one sandboxed pytest run."""

    passed: int  # tests that passed during their "call" phase
    failed: int  # tests that failed during their "call" phase
    total: int  # at least the number of supplied test cases (see run_pytest_suite)
    timed_out: bool  # True when the pytest subprocess exceeded its timeout
    output: str  # combined stdout/stderr of the runner (or an error message)
21
+
22
+
23
+ def _test_module_source(tests: Iterable[str]) -> str:
24
+ blocks: list[str] = ["from candidate import * # noqa: F401,F403"]
25
+ for index, test in enumerate(tests, start=1):
26
+ snippet = str(test).strip()
27
+ if not snippet:
28
+ continue
29
+ if snippet.startswith("def test_"):
30
+ blocks.append(snippet)
31
+ continue
32
+ blocks.append(
33
+ "\n".join(
34
+ [
35
+ f"def test_case_{index:03d}():",
36
+ f" assert {snippet}",
37
+ ]
38
+ )
39
+ )
40
+ return "\n\n".join(blocks) or "def test_placeholder():\n assert True\n"
41
+
42
+
43
def _runner_script() -> str:
    """Return the source of the in-sandbox pytest driver.

    The driver counts pass/fail results with a plugin hook and persists them
    as ``pytest_results.json`` so the parent process can read exact counts
    instead of parsing pytest's console output.
    """
    return """import json
import pathlib
import pytest


class Collector:
    def __init__(self) -> None:
        self.passed = 0
        self.failed = 0

    def pytest_runtest_logreport(self, report):
        if report.when != "call":
            return
        if report.passed:
            self.passed += 1
        elif report.failed:
            self.failed += 1


collector = Collector()
exit_code = pytest.main(["-q", "test_candidate.py"], plugins=[collector])
payload = {
    "passed": collector.passed,
    "failed": collector.failed,
    "exit_code": int(exit_code),
}
pathlib.Path("pytest_results.json").write_text(json.dumps(payload), encoding="utf-8")
"""
72
+
73
+
74
def run_pytest_suite(candidate_code: str, tests: Iterable[str], timeout_s: float = 3.0) -> PytestExecution:
    """Run *tests* against *candidate_code* under pytest in a throwaway temp dir.

    Never raises: timeouts, missing/corrupt result files, and any other
    failure are all folded into the returned PytestExecution (with all tests
    counted as failed).
    """
    test_cases = list(tests)
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-") as temp_dir:
            temp_path = Path(temp_dir)
            # Sandbox layout: candidate module, generated test module, driver.
            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (temp_path / "test_candidate.py").write_text(_test_module_source(test_cases), encoding="utf-8")
            (temp_path / "runner.py").write_text(_runner_script(), encoding="utf-8")

            try:
                completed = subprocess.run(
                    [sys.executable, "runner.py"],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=timeout_s,
                    check=False,
                )
            except subprocess.TimeoutExpired as exc:
                output = (exc.stdout or "") + (exc.stderr or "")
                # Timeout: count every supplied test as failed.
                return PytestExecution(
                    passed=0,
                    failed=max(len(test_cases), 1),
                    total=max(len(test_cases), 1),
                    timed_out=True,
                    output=(output or "pytest timed out").strip(),
                )

            result_path = temp_path / "pytest_results.json"
            if not result_path.exists():
                # Runner crashed before writing results (e.g. import error).
                output = (completed.stdout or "") + (completed.stderr or "")
                total = max(len(test_cases), 1)
                return PytestExecution(0, total, total, False, output.strip())

            try:
                payload = json.loads(result_path.read_text(encoding="utf-8"))
            except Exception as exc:
                output = ((completed.stdout or "") + (completed.stderr or "")).strip()
                return PytestExecution(0, max(len(test_cases), 1), max(len(test_cases), 1), False, (output or str(exc)).strip())

            passed = int(payload.get("passed", 0))
            failed = int(payload.get("failed", 0))
            # total never under-reports the number of requested tests.
            total = max(passed + failed, len(test_cases))
            output = ((completed.stdout or "") + (completed.stderr or "")).strip()
            return PytestExecution(passed, failed, total, False, output)
    except Exception as exc:
        return PytestExecution(0, max(len(test_cases), 1), max(len(test_cases), 1), False, str(exc))
121
+
server/graders/syntax.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Task graders for syntax and bug-fix tasks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .common import clamp_score, compiles, normalized_diff_score, style_score, syntax_error_message
6
+ from .optimization import grade_optimization_task
7
+ from .pytest_runner import run_pytest_suite
8
+ from ..models import TaskGrade
9
+ from ..tasks.task_bank import TaskSpec
10
+
11
+
12
def grade_syntax_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Grade a syntax-fix submission: full credit when it parses, otherwise
    partial credit scaled by whitespace-insensitive similarity to the reference."""
    compile_error = syntax_error_message(candidate_code)
    closeness = normalized_diff_score(candidate_code, task.reference_code)
    base_style = style_score(candidate_code, task.style_max_line_length)
    if compile_error:
        partial = clamp_score(0.15 + (0.55 * closeness))
        return TaskGrade(score=partial, syntax_score=0.0, quality_score=closeness * base_style, details={"compile_error": compile_error})
    return TaskGrade(score=1.0, syntax_score=1.0, quality_score=base_style, details={"compile_error": ""})
20
+
21
+
22
def grade_bug_fix_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
    """Grade a bug-fix submission by compiling it and running its test suite.

    The score is simply the fraction of passing tests; a compile failure or
    test timeout zeroes the score.
    """
    if not compiles(candidate_code):
        return TaskGrade(
            score=0.0,
            syntax_score=0.0,
            details={"compile_error": syntax_error_message(candidate_code)},
        )

    suite = list(task.visible_tests) + (list(task.hidden_tests) if include_hidden else [])
    run = run_pytest_suite(candidate_code, suite, timeout_s=3.0)
    if run.timed_out:
        return TaskGrade(
            score=0.0,
            syntax_score=1.0,
            tests_passed=run.passed,
            tests_total=run.total,
            timed_out=True,
            details={"compile_error": "", "tests": run.output},
        )

    fraction = run.passed / run.total if run.total else 0.0
    return TaskGrade(
        score=clamp_score(fraction),
        syntax_score=1.0,
        tests_passed=run.passed,
        tests_total=run.total,
        quality_score=style_score(candidate_code, task.style_max_line_length),
        details={"compile_error": "", "tests": run.output},
    )
52
+
53
+
54
def grade_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
    """Dispatch to the grader matching the task's kind (optimization is the fallback)."""
    kind = task.task_kind
    if kind == "bug_fix":
        return grade_bug_fix_task(candidate_code, task, include_hidden=include_hidden)
    if kind == "syntax_fix":
        return grade_syntax_task(candidate_code, task)
    return grade_optimization_task(candidate_code, task)
60
+
server/models.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Typed models for the self-contained server package."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List, Literal, Optional
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+ from .compat import Action, Observation, State
10
+
11
+
12
# Closed vocabularies shared by the API models below.
Difficulty = Literal["easy", "medium", "hard"]
TaskKind = Literal["syntax_fix", "bug_fix", "optimization"]
# Actions the agent may submit on each step.
ActionType = Literal["analyze_code", "edit_code", "run_tests", "submit_solution"]
# Classification axes for review findings.
Category = Literal["bug", "security", "performance", "maintainability", "style", "testing"]
Severity = Literal["critical", "warning", "info"]
17
+
18
+
19
class HistoryEntry(BaseModel):
    """One entry of an episode's per-step action history."""

    step: int = Field(..., ge=0)  # 0-based step index within the episode
    action_type: ActionType
    status: str  # free-form outcome label for the action
    reward: float  # reward emitted for this step
24
+
25
+
26
class RewardDetails(BaseModel):
    """Breakdown of a single step's reward into its components."""

    value: float  # final combined reward for the step
    syntax_reward: float = 0.0
    test_reward: float = 0.0
    quality_bonus: float = 0.0
    correctness_bonus: float = 0.0
    progress_delta: float = 0.0  # curr_score - prev_score (presumably; confirm in env)
    stagnation_penalty: float = 0.0
    regression_penalty: float = 0.0
    invalid_action_penalty: float = 0.0
    timeout_penalty: float = 0.0
    reason: str  # human-readable explanation of how `value` was computed
    prev_score: float = 0.0
    curr_score: float = 0.0
    code_changed: bool = False  # whether the step actually modified the code
41
+
42
+
43
class PythonCodeReviewAction(Action):
    """Action submitted by the agent on each step."""

    action_type: ActionType
    # Candidate code payload; may be None (which actions require it is
    # enforced by the environment, not by this model).
    code: Optional[str] = None
46
+
47
+
48
class PythonCodeReviewObservation(Observation):
    """Observation returned to the agent after reset/step."""

    task_id: str
    title: str = ""
    difficulty: Difficulty
    task_kind: Optional[TaskKind] = None
    task_description: str
    current_code: str  # the working code as of this step
    errors: str  # latest error text ("" when clean)
    test_results: str  # latest test-run output ("" before any run)
    visible_tests: List[str] = Field(default_factory=list)
    history: List[HistoryEntry] = Field(default_factory=list)
    attempts_remaining: int = Field(..., ge=0)
    last_action_status: str = ""
    score: float = Field(..., ge=0.0, le=1.0)  # current graded score
    # Defaults to a neutral "Reset" reward so a fresh observation is complete.
    reward_details: RewardDetails = Field(
        default_factory=lambda: RewardDetails(value=0.0, reason="Reset")
    )
65
+
66
+
67
class PythonCodeReviewState(State):
    """Server-side episode state tracked between steps."""

    episode_id: str
    step_count: int = Field(default=0, ge=0)
    task_id: Optional[str] = None  # None until a task is loaded
    difficulty: Optional[Difficulty] = None
    task_kind: Optional[TaskKind] = None
    attempts_remaining: int = Field(default=0, ge=0)
    current_code: str = ""
    errors: str = ""
    test_results: str = ""
    history: List[HistoryEntry] = Field(default_factory=list)
    score: float = Field(default=0.0, ge=0.0, le=1.0)
    done: bool = False  # True once the episode has terminated
80
+
81
+
82
class TaskDescriptor(BaseModel):
    """Public description of a task (excludes hidden tests and reference code)."""

    task_id: str
    title: str
    difficulty: Difficulty
    task_kind: Optional[TaskKind] = None
    task_description: str = ""
    starter_code: str = ""
    visible_tests: List[str] = Field(default_factory=list)
    goal: str = ""
    repo_summary: str = ""
    changed_files: List[str] = Field(default_factory=list)
    available_files: List[str] = Field(default_factory=list)
    max_steps: int = Field(..., ge=1)  # episode step budget for this task
95
+
96
+
97
class TaskSummary(BaseModel):
    """Compact task listing entry."""

    task_id: str
    difficulty: Difficulty
    title: str
    goal: str = ""
102
+
103
+
104
class ReviewFinding(BaseModel):
    """A single issue reported in a code review."""

    title: str
    file_path: str = ""
    line: Optional[int] = Field(default=None, ge=1)  # 1-based line, when known
    category: Category = "bug"
    severity: Severity = "warning"
    rationale: str = ""  # why this is an issue
    recommendation: str = ""  # how to fix it
    rule_id: str = ""

    # Legacy accessors mirroring older field names.
    @property
    def explanation(self) -> str:
        """Alias for `rationale`."""
        return self.rationale

    @property
    def suggested_fix(self) -> str:
        """Alias for `recommendation`."""
        return self.recommendation
121
+
122
+
123
class DirectReviewResponse(BaseModel):
    """Response payload for a one-shot (non-episodic) review request."""

    issues: List[ReviewFinding] = Field(default_factory=list)
    summary: str = ""
    score: float = Field(default=0.0, ge=0.0, le=1.0)
    improved_code: Optional[str] = None  # optional rewritten version of the input
128
+
129
+
130
class TaskGrade(BaseModel):
    """Result of grading a submission; component scores are all in [0, 1]."""

    score: float = Field(..., ge=0.0, le=1.0)  # overall grade
    syntax_score: float = Field(default=0.0, ge=0.0, le=1.0)
    tests_passed: int = Field(default=0, ge=0)
    tests_total: int = Field(default=0, ge=0)
    quality_score: float = Field(default=0.0, ge=0.0, le=1.0)
    runtime_score: float = Field(default=0.0, ge=0.0, le=1.0)
    timed_out: bool = False  # True when a test run or benchmark timed out
    matched_issue_ids: List[str] = Field(default_factory=list)
    false_positives: int = Field(default=0, ge=0)
    duplicate_findings: int = Field(default=0, ge=0)
    matched_weight: float = Field(default=0.0, ge=0.0, le=1.0)
    details: Dict[str, Any] = Field(default_factory=dict)  # grader diagnostics
143
+
144
+
145
class HealthResponse(BaseModel):
    """Payload of the health-check endpoint."""

    status: Literal["ok"] = "ok"
    environment: str = "python_code_review_env"
    task_count: int = Field(default=0, ge=0)  # number of registered tasks
149
+
server/tasks/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Self-contained task definitions for container builds."""
2
+
3
+ from .task_bank import TaskSpec, get_task, list_task_descriptors, list_task_summaries, task_ids
4
+
5
+ __all__ = [
6
+ "TaskSpec",
7
+ "get_task",
8
+ "list_task_descriptors",
9
+ "list_task_summaries",
10
+ "task_ids",
11
+ ]
12
+
server/tasks/task_bank.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic task bank for self-contained server builds."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Dict, List, Optional
7
+
8
+ from ..models import Difficulty, TaskDescriptor, TaskKind
9
+
10
+
11
@dataclass(frozen=True)
class TaskSpec:
    """Immutable specification of one deterministic code-review task."""

    task_id: str  # stable identifier used for lookup
    title: str
    difficulty: Difficulty
    task_kind: TaskKind
    task_description: str  # prompt text shown to the agent
    starter_code: str  # code the agent starts from
    reference_code: str  # intended solution (used for similarity-based partial credit)
    visible_tests: List[str]  # test expressions exposed to the agent
    hidden_tests: List[str]  # extra tests applied only at grading time
    max_steps: int = 10
    # Benchmark fields are only meaningful for optimization tasks.
    benchmark_entrypoint: Optional[str] = None  # function name to time
    benchmark_builder: Optional[str] = None  # source of build_benchmark_events()
    benchmark_repeats: int = 1
    benchmark_timeout_s: float = 2.0
    style_max_line_length: int = 88
    expected_quality_markers: List[str] = field(default_factory=list)  # rewarded substrings

    def to_descriptor(self) -> TaskDescriptor:
        """Return the public view of this task (hidden tests and reference omitted)."""
        return TaskDescriptor(
            task_id=self.task_id,
            title=self.title,
            difficulty=self.difficulty,
            task_kind=self.task_kind,
            task_description=self.task_description,
            starter_code=self.starter_code,
            visible_tests=list(self.visible_tests),
            max_steps=self.max_steps,
        )
41
+
42
+
43
# ---------------------------------------------------------------------------
# Built-in tasks. Each constant below is a fully self-contained TaskSpec;
# starter_code is what the agent sees, reference_code is the intended fix,
# and hidden_tests are only applied at grading time.
# ---------------------------------------------------------------------------

# Easy: repair a function left with syntax errors.
TASK_SYNTAX_FIX = TaskSpec(
    task_id="syntax-fix-easy",
    title="Fix a syntax-broken username normalizer",
    difficulty="easy",
    task_kind="syntax_fix",
    task_description=(
        "You are reviewing a utility function before merge. The submitted patch left "
        "the function with syntax errors. Repair the code so it compiles and preserves "
        "the intended behavior of trimming, lowercasing, and replacing spaces with underscores."
    ),
    starter_code='''def normalize_username(raw_name: str) -> str:
    cleaned = raw_name.strip().lower(
    if not cleaned:
        return "anonymous"
    return cleaned.replace(" ", "_")
''',
    reference_code='''def normalize_username(raw_name: str) -> str:
    cleaned = raw_name.strip().lower()
    if not cleaned:
        return "anonymous"
    return cleaned.replace(" ", "_")
''',
    visible_tests=[
        "normalize_username(' Alice Smith ') == 'alice_smith'",
        "normalize_username(' ') == 'anonymous'",
        "normalize_username('Bob') == 'bob'",
    ],
    hidden_tests=[
        "normalize_username(' HELLO WORLD ') == 'hello_world'",
        "normalize_username('') == 'anonymous'",
    ],
    max_steps=8,
)


# Medium: the starter returns the un-discounted subtotal.
TASK_BUG_FIX = TaskSpec(
    task_id="bug-fix-medium",
    title="Repair invoice discount calculation logic",
    difficulty="medium",
    task_kind="bug_fix",
    task_description=(
        "A billing helper function is returning the wrong amount after applying discounts. "
        "The function signature is correct, but the calculation logic is broken. "
        "Inspect the implementation, run visible tests, and fix the bug so all tests pass. "
        "Do not change the function signature or validation logic."
    ),
    starter_code='''from typing import Iterable


def calculate_invoice_total(line_items: Iterable[int], discount_percent: int) -> int:
    """Calculate invoice total with discount applied."""
    if discount_percent < 0 or discount_percent > 100:
        raise ValueError("discount_percent must be between 0 and 100")

    subtotal = sum(line_items)
    discounted_total = subtotal - (subtotal * discount_percent // 100)
    return subtotal
''',
    reference_code='''from typing import Iterable


def calculate_invoice_total(line_items: Iterable[int], discount_percent: int) -> int:
    """Calculate invoice total with discount applied."""
    if discount_percent < 0 or discount_percent > 100:
        raise ValueError("discount_percent must be between 0 and 100")

    subtotal = sum(line_items)
    discounted_total = subtotal - (subtotal * discount_percent // 100)
    return discounted_total
''',
    visible_tests=[
        "calculate_invoice_total([1000, 2000], 0) == 3000",
        "calculate_invoice_total([1000, 2000], 50) == 1500",
        "calculate_invoice_total([1000], 10) == 900",
        "calculate_invoice_total([], 0) == 0",
    ],
    hidden_tests=[
        "calculate_invoice_total([100, 200, 300], 25) == 450",
        "calculate_invoice_total([5000], 99) == 50",
    ],
    max_steps=10,
)


# Hard: refactor an O(n**2) aggregation; graded on tests, speedup, and style.
TASK_OPTIMIZATION = TaskSpec(
    task_id="optimization-hard",
    title="Optimize inefficient user activity summarization",
    difficulty="hard",
    task_kind="optimization",
    task_description=(
        "Code review found that `summarize_user_activity` is inefficient for large event streams. "
        "The current implementation repeatedly scans the full event list for every user, making it O(n**2). "
        "Refactor it to aggregate counts in one pass while preserving the sorted output contract. "
        "Style and code quality also matter: use idiomatic Python, proper types, and clear logic. "
        "All tests must pass, and the optimized version should be measurably faster."
    ),
    starter_code='''from typing import Iterable


def summarize_user_activity(events: Iterable[dict]) -> list[tuple[str, int]]:
    """Aggregate user activity counts."""

    ordered_users = []
    for event in events:
        user_id = event["user_id"]
        if user_id not in ordered_users:
            ordered_users.append(user_id)

    summary = []
    for user_id in ordered_users:
        count = 0
        for event in events:
            if event["user_id"] == user_id:
                count += 1
        summary.append((user_id, count))
    return sorted(summary, key=lambda item: (-item[1], item[0]))
''',
    reference_code='''from collections import Counter
from typing import Iterable


def summarize_user_activity(events: Iterable[dict]) -> list[tuple[str, int]]:
    """Aggregate user activity counts in one pass."""

    counts = Counter(event["user_id"] for event in events)
    return sorted(counts.items(), key=lambda item: (-item[1], item[0]))
''',
    visible_tests=[
        "summarize_user_activity([{'user_id': 'alice'}, {'user_id': 'bob'}, {'user_id': 'alice'}]) == [('alice', 2), ('bob', 1)]",
        "summarize_user_activity([{'user_id': 'z'}, {'user_id': 'a'}]) == [('a', 1), ('z', 1)]",
        "summarize_user_activity([]) == []",
        "summarize_user_activity([{'user_id': 'solo'}]) == [('solo', 1)]",
    ],
    hidden_tests=[
        "summarize_user_activity([{'user_id': 'u2'}, {'user_id': 'u1'}, {'user_id': 'u2'}, {'user_id': 'u2'}, {'user_id': 'u1'}]) == [('u2', 3), ('u1', 2)]",
    ],
    max_steps=10,
    benchmark_entrypoint="summarize_user_activity",
    benchmark_builder='''def build_benchmark_events():
    return [{"user_id": f"user_{index % 400}"} for index in range(6000)]''',
    benchmark_repeats=3,
    benchmark_timeout_s=1.0,
    style_max_line_length=88,
    expected_quality_markers=["Counter", "sorted"],
)


# Registry keyed by task_id; insertion order is the canonical easy->hard order.
TASKS: Dict[str, TaskSpec] = {
    "syntax-fix-easy": TASK_SYNTAX_FIX,
    "bug-fix-medium": TASK_BUG_FIX,
    "optimization-hard": TASK_OPTIMIZATION,
}
195
+
196
+
197
def task_ids() -> List[str]:
    """Return the canonical ordering of task identifiers.

    Derived from the TASKS registry instead of a hard-coded duplicate list,
    so a task added to (or removed from) TASKS can never be missed here;
    dict insertion order preserves the easy -> medium -> hard sequence.
    """
    return list(TASKS)
199
+
200
+
201
def get_task(task_id: str) -> TaskSpec:
    """Look up a task by id; raise ValueError listing the known ids otherwise."""
    spec = TASKS.get(task_id)
    if spec is None:
        raise ValueError(f"Task {task_id} not found. Available: {list(TASKS.keys())}")
    return spec
205
+
206
+
207
def list_task_descriptors() -> List[TaskDescriptor]:
    """Return public descriptors for every registered task, in canonical order."""
    return [spec.to_descriptor() for spec in map(get_task, task_ids())]
209
+
210
+
211
def list_task_summaries() -> List[TaskDescriptor]:
    """Alias for list_task_descriptors(); returns the same full descriptors."""
    return list_task_descriptors()
213
+