Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- Dockerfile +3 -3
- inference.py +37 -7
- pytest-cache-files-1f62ra1g/container_sim/server/Dockerfile +32 -0
- pytest-cache-files-1f62ra1g/container_sim/server/__init__.py +5 -0
- pytest-cache-files-1f62ra1g/container_sim/server/app.py +127 -0
- pytest-cache-files-1f62ra1g/container_sim/server/code_review_env_environment.py +9 -0
- pytest-cache-files-1f62ra1g/container_sim/server/code_review_environment.py +5 -0
- pytest-cache-files-1f62ra1g/container_sim/server/compat.py +89 -0
- pytest-cache-files-1f62ra1g/container_sim/server/env.py +1 -0
- pytest-cache-files-1f62ra1g/container_sim/server/env_safe.py +505 -0
- pytest-cache-files-1f62ra1g/container_sim/server/graders/__init__.py +17 -0
- pytest-cache-files-1f62ra1g/container_sim/server/graders/common.py +69 -0
- pytest-cache-files-1f62ra1g/container_sim/server/graders/optimization.py +135 -0
- pytest-cache-files-1f62ra1g/container_sim/server/graders/pytest_runner.py +121 -0
- pytest-cache-files-1f62ra1g/container_sim/server/graders/syntax.py +60 -0
- pytest-cache-files-1f62ra1g/container_sim/server/grading.py +147 -0
- pytest-cache-files-1f62ra1g/container_sim/server/models.py +149 -0
- pytest-cache-files-1f62ra1g/container_sim/server/python_env_environment.py +9 -0
- pytest-cache-files-1f62ra1g/container_sim/server/requirements.txt +6 -0
- pytest-cache-files-1f62ra1g/container_sim/server/static_review.py +273 -0
- pytest-cache-files-1f62ra1g/container_sim/server/task_bank.py +340 -0
- pytest-cache-files-1f62ra1g/container_sim/server/tasks/__init__.py +12 -0
- pytest-cache-files-1f62ra1g/container_sim/server/tasks/task_bank.py +213 -0
- server/app.py +21 -11
- server/compat.py +89 -0
- server/env_safe.py +26 -13
- server/graders/__init__.py +17 -0
- server/graders/common.py +69 -0
- server/graders/optimization.py +135 -0
- server/graders/pytest_runner.py +121 -0
- server/graders/syntax.py +60 -0
- server/models.py +149 -0
- server/tasks/__init__.py +12 -0
- server/tasks/task_bank.py +213 -0
Dockerfile
CHANGED
|
@@ -10,11 +10,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
| 10 |
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
# Install Python dependencies
|
| 13 |
-
COPY
|
| 14 |
RUN pip install --no-cache-dir -r /app/server/requirements.txt
|
| 15 |
|
| 16 |
-
# Copy
|
| 17 |
-
COPY . /app
|
| 18 |
|
| 19 |
# Set environment variables
|
| 20 |
ENV PYTHONUNBUFFERED=1
|
|
|
|
| 10 |
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
# Install Python dependencies
|
| 13 |
+
COPY requirements.txt /app/server/requirements.txt
|
| 14 |
RUN pip install --no-cache-dir -r /app/server/requirements.txt
|
| 15 |
|
| 16 |
+
# Copy the self-contained server package
|
| 17 |
+
COPY . /app/server
|
| 18 |
|
| 19 |
# Set environment variables
|
| 20 |
ENV PYTHONUNBUFFERED=1
|
inference.py
CHANGED
|
@@ -404,7 +404,7 @@ def run_env(client: Optional[Any], model: str) -> Dict[str, Any]:
|
|
| 404 |
|
| 405 |
|
| 406 |
def format_step_message(result: Dict[str, Any]) -> str:
|
| 407 |
-
"""Format the
|
| 408 |
try:
|
| 409 |
fallback = bool(result.get("fallback", False))
|
| 410 |
reason = safe_text(result.get("reason", "completed"), "completed").lower().replace(" ", "_")
|
|
@@ -429,21 +429,49 @@ def format_step_message(result: Dict[str, Any]) -> str:
|
|
| 429 |
return "error handled: formatting_failed"
|
| 430 |
|
| 431 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
def main() -> int:
|
| 433 |
"""Run the inference workflow and always terminate successfully."""
|
|
|
|
| 434 |
step_message = "error handled: initialization_failed"
|
|
|
|
|
|
|
| 435 |
try:
|
| 436 |
model_name = safe_env("MODEL_NAME", DEFAULT_MODEL_NAME) or DEFAULT_MODEL_NAME
|
| 437 |
client = create_client()
|
| 438 |
result = run_env(client, model_name)
|
| 439 |
step_message = format_step_message(result)
|
|
|
|
| 440 |
except BaseException as exc:
|
| 441 |
step_message = f"error handled: {safe_text(exc, 'unexpected_failure').lower().replace(' ', '_')[:64]}"
|
|
|
|
| 442 |
finally:
|
| 443 |
try:
|
| 444 |
-
|
| 445 |
-
print(f"STEP: {step_message}")
|
| 446 |
-
print("END")
|
| 447 |
except Exception:
|
| 448 |
pass
|
| 449 |
return 0
|
|
@@ -454,9 +482,11 @@ if __name__ == "__main__":
|
|
| 454 |
main()
|
| 455 |
except BaseException:
|
| 456 |
try:
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
|
|
|
|
|
|
| 460 |
except Exception:
|
| 461 |
pass
|
| 462 |
sys.exit(0)
|
|
|
|
| 404 |
|
| 405 |
|
| 406 |
def format_step_message(result: Dict[str, Any]) -> str:
|
| 407 |
+
"""Format the structured STEP payload for stdout."""
|
| 408 |
try:
|
| 409 |
fallback = bool(result.get("fallback", False))
|
| 410 |
reason = safe_text(result.get("reason", "completed"), "completed").lower().replace(" ", "_")
|
|
|
|
| 429 |
return "error handled: formatting_failed"
|
| 430 |
|
| 431 |
|
| 432 |
+
def format_start_message() -> str:
|
| 433 |
+
"""Format the START payload for stdout."""
|
| 434 |
+
return "task=python_code_review_env"
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
def format_end_message(result: Optional[Dict[str, Any]]) -> str:
|
| 438 |
+
"""Format the structured END payload for stdout."""
|
| 439 |
+
try:
|
| 440 |
+
payload = result or {}
|
| 441 |
+
status = safe_text(payload.get("status", "ok"), "ok").lower().replace(" ", "_")
|
| 442 |
+
score = safe_float(payload.get("score", 0.0), 0.0)
|
| 443 |
+
done = str(bool(payload.get("done", True))).lower()
|
| 444 |
+
fallback = str(bool(payload.get("fallback", True))).lower()
|
| 445 |
+
return f"task=python_code_review_env status={status} score={score:.4f} done={done} fallback={fallback}"
|
| 446 |
+
except Exception:
|
| 447 |
+
return "task=python_code_review_env status=ok score=0.0000 done=true fallback=true"
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
def emit_structured_output(start_message: str, step_message: str, end_message: str) -> None:
|
| 451 |
+
"""Emit evaluator-readable output blocks to stdout."""
|
| 452 |
+
print(f"[START] {start_message}", flush=True)
|
| 453 |
+
print(f"[STEP] {step_message}", flush=True)
|
| 454 |
+
print(f"[END] {end_message}", flush=True)
|
| 455 |
+
|
| 456 |
+
|
| 457 |
def main() -> int:
|
| 458 |
"""Run the inference workflow and always terminate successfully."""
|
| 459 |
+
start_message = format_start_message()
|
| 460 |
step_message = "error handled: initialization_failed"
|
| 461 |
+
end_message = "task=python_code_review_env status=ok score=0.0000 done=true fallback=true"
|
| 462 |
+
result: Optional[Dict[str, Any]] = None
|
| 463 |
try:
|
| 464 |
model_name = safe_env("MODEL_NAME", DEFAULT_MODEL_NAME) or DEFAULT_MODEL_NAME
|
| 465 |
client = create_client()
|
| 466 |
result = run_env(client, model_name)
|
| 467 |
step_message = format_step_message(result)
|
| 468 |
+
end_message = format_end_message(result)
|
| 469 |
except BaseException as exc:
|
| 470 |
step_message = f"error handled: {safe_text(exc, 'unexpected_failure').lower().replace(' ', '_')[:64]}"
|
| 471 |
+
end_message = format_end_message(result)
|
| 472 |
finally:
|
| 473 |
try:
|
| 474 |
+
emit_structured_output(start_message, step_message, end_message)
|
|
|
|
|
|
|
| 475 |
except Exception:
|
| 476 |
pass
|
| 477 |
return 0
|
|
|
|
| 482 |
main()
|
| 483 |
except BaseException:
|
| 484 |
try:
|
| 485 |
+
emit_structured_output(
|
| 486 |
+
format_start_message(),
|
| 487 |
+
"error handled: fatal_guard",
|
| 488 |
+
"task=python_code_review_env status=ok score=0.0000 done=true fallback=true",
|
| 489 |
+
)
|
| 490 |
except Exception:
|
| 491 |
pass
|
| 492 |
sys.exit(0)
|
pytest-cache-files-1f62ra1g/container_sim/server/Dockerfile
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install system dependencies
|
| 6 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 7 |
+
gcc \
|
| 8 |
+
git \
|
| 9 |
+
curl \
|
| 10 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
+
|
| 12 |
+
# Install Python dependencies
|
| 13 |
+
COPY requirements.txt /app/server/requirements.txt
|
| 14 |
+
RUN pip install --no-cache-dir -r /app/server/requirements.txt
|
| 15 |
+
|
| 16 |
+
# Copy the self-contained server package
|
| 17 |
+
COPY . /app/server
|
| 18 |
+
|
| 19 |
+
# Set environment variables
|
| 20 |
+
ENV PYTHONUNBUFFERED=1
|
| 21 |
+
ENV HOST=0.0.0.0
|
| 22 |
+
ENV PORT=8000
|
| 23 |
+
ENV WORKERS=1
|
| 24 |
+
ENV MAX_CONCURRENT_ENVS=16
|
| 25 |
+
|
| 26 |
+
# Health check
|
| 27 |
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
|
| 28 |
+
CMD curl -f http://localhost:${PORT}/health || exit 1
|
| 29 |
+
|
| 30 |
+
# Run FastAPI app
|
| 31 |
+
EXPOSE ${PORT}
|
| 32 |
+
CMD ["python", "-m", "server.app"]
|
pytest-cache-files-1f62ra1g/container_sim/server/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Server exports for the Python code review environment."""
|
| 2 |
+
|
| 3 |
+
from .code_review_environment import CodeReviewEnvironment, PythonCodeReviewEnvironment, PythonEnvironment
|
| 4 |
+
|
| 5 |
+
__all__ = ["PythonEnvironment", "PythonCodeReviewEnvironment", "CodeReviewEnvironment"]
|
pytest-cache-files-1f62ra1g/container_sim/server/app.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application for the Python code review environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
from fastapi import APIRouter, HTTPException
|
| 8 |
+
from fastapi.responses import RedirectResponse
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from compat import create_app
|
| 12 |
+
from models import (
|
| 13 |
+
HealthResponse,
|
| 14 |
+
PythonCodeReviewAction,
|
| 15 |
+
PythonCodeReviewObservation,
|
| 16 |
+
PythonCodeReviewState,
|
| 17 |
+
TaskDescriptor,
|
| 18 |
+
TaskGrade,
|
| 19 |
+
)
|
| 20 |
+
except Exception:
|
| 21 |
+
from .compat import create_app
|
| 22 |
+
from .models import (
|
| 23 |
+
HealthResponse,
|
| 24 |
+
PythonCodeReviewAction,
|
| 25 |
+
PythonCodeReviewObservation,
|
| 26 |
+
PythonCodeReviewState,
|
| 27 |
+
TaskDescriptor,
|
| 28 |
+
TaskGrade,
|
| 29 |
+
)
|
| 30 |
+
from server.env import PythonCodeReviewEnvironment
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
try:
|
| 34 |
+
MAX_CONCURRENT_ENVS = max(int(os.getenv("MAX_CONCURRENT_ENVS", "16")), 1)
|
| 35 |
+
except Exception:
|
| 36 |
+
MAX_CONCURRENT_ENVS = 16
|
| 37 |
+
|
| 38 |
+
python_env = PythonCodeReviewEnvironment(verbose=False)
|
| 39 |
+
app = create_app(
|
| 40 |
+
PythonCodeReviewEnvironment,
|
| 41 |
+
PythonCodeReviewAction,
|
| 42 |
+
PythonCodeReviewObservation,
|
| 43 |
+
max_concurrent_envs=MAX_CONCURRENT_ENVS,
|
| 44 |
+
)
|
| 45 |
+
router = APIRouter(tags=["python-code-review"])
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@router.get("/", include_in_schema=False)
|
| 49 |
+
def root() -> RedirectResponse:
|
| 50 |
+
"""Redirect root to API documentation."""
|
| 51 |
+
return RedirectResponse(url="/docs")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
@router.get("/health", response_model=HealthResponse)
|
| 55 |
+
def health() -> HealthResponse:
|
| 56 |
+
"""Health check endpoint for deployment monitoring."""
|
| 57 |
+
return python_env.health()
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@router.get("/tasks", response_model=list)
|
| 61 |
+
def list_tasks() -> list:
|
| 62 |
+
"""List all available deterministic tasks."""
|
| 63 |
+
return python_env.list_task_summaries()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@router.get("/tasks/{task_id}", response_model=object)
|
| 67 |
+
def get_task(task_id: str) -> object:
|
| 68 |
+
"""Get a specific task by ID."""
|
| 69 |
+
try:
|
| 70 |
+
return python_env.get_task(task_id)
|
| 71 |
+
except ValueError as exc:
|
| 72 |
+
raise HTTPException(status_code=404, detail=str(exc)) from exc
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@router.post("/tasks/{task_id}/grade", response_model=TaskGrade)
|
| 76 |
+
def grade_task(task_id: str, payload: PythonCodeReviewAction) -> TaskGrade:
|
| 77 |
+
"""Grade code submission for a task without running an episode."""
|
| 78 |
+
if payload.action_type != "edit_code" or not payload.code:
|
| 79 |
+
raise HTTPException(
|
| 80 |
+
status_code=400,
|
| 81 |
+
detail="Requires action_type='edit_code' with code parameter."
|
| 82 |
+
)
|
| 83 |
+
try:
|
| 84 |
+
return python_env.grade_task_submission(task_id=task_id, code=payload.code)
|
| 85 |
+
except ValueError as exc:
|
| 86 |
+
raise HTTPException(status_code=404, detail=str(exc)) from exc
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
@router.post("/state", response_model=PythonCodeReviewState)
|
| 90 |
+
def get_state_post() -> RedirectResponse:
|
| 91 |
+
"""Redirect POST /state to GET for compatibility."""
|
| 92 |
+
return RedirectResponse(url="/state", status_code=303)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
app.include_router(router)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _prioritize_route(path: str, methods: set[str]) -> None:
|
| 99 |
+
"""Move a matching custom route ahead of default OpenEnv routes."""
|
| 100 |
+
try:
|
| 101 |
+
for index in range(len(app.router.routes) - 1, -1, -1):
|
| 102 |
+
route = app.router.routes[index]
|
| 103 |
+
route_path = getattr(route, "path", None)
|
| 104 |
+
route_methods = set(getattr(route, "methods", set()) or set())
|
| 105 |
+
if route_path == path and methods.issubset(route_methods):
|
| 106 |
+
app.router.routes.insert(0, app.router.routes.pop(index))
|
| 107 |
+
break
|
| 108 |
+
except Exception:
|
| 109 |
+
pass
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
_prioritize_route("/health", {"GET"})
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
|
| 116 |
+
"""Run the FastAPI application with uvicorn."""
|
| 117 |
+
import uvicorn
|
| 118 |
+
uvicorn.run(
|
| 119 |
+
app,
|
| 120 |
+
host=os.getenv("HOST", host),
|
| 121 |
+
port=int(os.getenv("PORT", str(port))),
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
if __name__ == "__main__":
|
| 126 |
+
main()
|
| 127 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/code_review_env_environment.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility shim for older imports."""
|
| 2 |
+
|
| 3 |
+
try:
|
| 4 |
+
from server.code_review_environment import CodeReviewEnvironment
|
| 5 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 6 |
+
from .code_review_environment import CodeReviewEnvironment
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
__all__ = ["CodeReviewEnvironment"]
|
pytest-cache-files-1f62ra1g/container_sim/server/code_review_environment.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility wrapper for older imports."""
|
| 2 |
+
|
| 3 |
+
from .env import CodeReviewEnvironment, PythonCodeReviewEnvironment, PythonEnvironment
|
| 4 |
+
|
| 5 |
+
__all__ = ["CodeReviewEnvironment", "PythonCodeReviewEnvironment", "PythonEnvironment"]
|
pytest-cache-files-1f62ra1g/container_sim/server/compat.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility helpers for OpenEnv and FastMCP runtime drift."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import sys
|
| 6 |
+
import types
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def install_openenv_fastmcp_compat() -> None:
|
| 11 |
+
"""Patch FastMCP API differences so older OpenEnv builds keep importing."""
|
| 12 |
+
try:
|
| 13 |
+
import fastmcp # type: ignore
|
| 14 |
+
except Exception:
|
| 15 |
+
return
|
| 16 |
+
|
| 17 |
+
try:
|
| 18 |
+
if not hasattr(fastmcp, "Client"):
|
| 19 |
+
class CompatClient:
|
| 20 |
+
"""Minimal async MCP client used for legacy OpenEnv imports."""
|
| 21 |
+
|
| 22 |
+
def __init__(self, *args: Any, **kwargs: Any) -> None:
|
| 23 |
+
self.args = args
|
| 24 |
+
self.kwargs = kwargs
|
| 25 |
+
|
| 26 |
+
async def __aenter__(self) -> "CompatClient":
|
| 27 |
+
return self
|
| 28 |
+
|
| 29 |
+
async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool:
|
| 30 |
+
return False
|
| 31 |
+
|
| 32 |
+
async def list_tools(self) -> list[Any]:
|
| 33 |
+
return []
|
| 34 |
+
|
| 35 |
+
async def call_tool(self, tool_name: str, arguments: dict[str, Any]) -> Any:
|
| 36 |
+
raise RuntimeError(
|
| 37 |
+
f"MCP client compatibility mode cannot call tool: {tool_name}"
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
fastmcp.Client = CompatClient # type: ignore[attr-defined]
|
| 41 |
+
except Exception:
|
| 42 |
+
pass
|
| 43 |
+
|
| 44 |
+
try:
|
| 45 |
+
client_pkg = sys.modules.get("fastmcp.client")
|
| 46 |
+
if client_pkg is None:
|
| 47 |
+
client_pkg = types.ModuleType("fastmcp.client")
|
| 48 |
+
sys.modules["fastmcp.client"] = client_pkg
|
| 49 |
+
|
| 50 |
+
client_mod = sys.modules.get("fastmcp.client.client")
|
| 51 |
+
if client_mod is None:
|
| 52 |
+
client_mod = types.ModuleType("fastmcp.client.client")
|
| 53 |
+
sys.modules["fastmcp.client.client"] = client_mod
|
| 54 |
+
|
| 55 |
+
if not hasattr(client_mod, "CallToolResult"):
|
| 56 |
+
class CallToolResult:
|
| 57 |
+
"""Compatibility container for legacy OpenEnv response handling."""
|
| 58 |
+
|
| 59 |
+
def __init__(
|
| 60 |
+
self,
|
| 61 |
+
content: Any = None,
|
| 62 |
+
structured_content: Any = None,
|
| 63 |
+
meta: Any = None,
|
| 64 |
+
data: Any = None,
|
| 65 |
+
is_error: bool = False,
|
| 66 |
+
) -> None:
|
| 67 |
+
self.content = content
|
| 68 |
+
self.structured_content = structured_content
|
| 69 |
+
self.meta = meta
|
| 70 |
+
self.data = data
|
| 71 |
+
self.is_error = is_error
|
| 72 |
+
|
| 73 |
+
client_mod.CallToolResult = CallToolResult
|
| 74 |
+
|
| 75 |
+
client_pkg.client = client_mod # type: ignore[attr-defined]
|
| 76 |
+
except Exception:
|
| 77 |
+
pass
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
install_openenv_fastmcp_compat()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
from openenv.core.env_server.http_server import create_app as openenv_create_app
|
| 84 |
+
from openenv.core.env_server.interfaces import Environment
|
| 85 |
+
from openenv.core.env_server.types import Action, Observation, State
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
create_app = openenv_create_app
|
| 89 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/env.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .env_safe import * # noqa: F401,F403
|
pytest-cache-files-1f62ra1g/container_sim/server/env_safe.py
ADDED
|
@@ -0,0 +1,505 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Safe OpenEnv environment for deterministic Python code repair tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Optional
|
| 6 |
+
from uuid import uuid4
|
| 7 |
+
|
| 8 |
+
try:
|
| 9 |
+
from compat import Environment
|
| 10 |
+
from graders import grade_task
|
| 11 |
+
from models import (
|
| 12 |
+
HealthResponse,
|
| 13 |
+
HistoryEntry,
|
| 14 |
+
PythonCodeReviewAction,
|
| 15 |
+
PythonCodeReviewObservation,
|
| 16 |
+
PythonCodeReviewState,
|
| 17 |
+
RewardDetails,
|
| 18 |
+
TaskGrade,
|
| 19 |
+
)
|
| 20 |
+
from tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
|
| 21 |
+
except Exception:
|
| 22 |
+
from .compat import Environment
|
| 23 |
+
from .graders import grade_task
|
| 24 |
+
from .models import (
|
| 25 |
+
HealthResponse,
|
| 26 |
+
HistoryEntry,
|
| 27 |
+
PythonCodeReviewAction,
|
| 28 |
+
PythonCodeReviewObservation,
|
| 29 |
+
PythonCodeReviewState,
|
| 30 |
+
RewardDetails,
|
| 31 |
+
TaskGrade,
|
| 32 |
+
)
|
| 33 |
+
from .tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
INVALID_ACTION_PENALTY = 0.10
|
| 37 |
+
NO_PROGRESS_PENALTY = 0.08
|
| 38 |
+
REPEATED_ACTION_PENALTY = 0.05
|
| 39 |
+
BASE_STEP_PENALTY = 0.02
|
| 40 |
+
ANALYZE_STEP_PENALTY = 0.01
|
| 41 |
+
SUBMIT_COMPLETION_BONUS = 0.30
|
| 42 |
+
TIMEOUT_PENALTY = 0.12
|
| 43 |
+
VALID_ACTIONS = {"analyze_code", "edit_code", "run_tests", "submit_solution"}
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
|
| 47 |
+
"""Clamp a scalar to a bounded numeric interval."""
|
| 48 |
+
try:
|
| 49 |
+
return max(low, min(high, float(value)))
|
| 50 |
+
except Exception:
|
| 51 |
+
return low
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _safe_text(value: Any, default: str = "") -> str:
|
| 55 |
+
"""Convert values into short stable strings."""
|
| 56 |
+
try:
|
| 57 |
+
text = str(value)
|
| 58 |
+
except Exception:
|
| 59 |
+
return default
|
| 60 |
+
text = " ".join(text.split())
|
| 61 |
+
return text[:240] if text else default
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class PythonCodeReviewEnvironment(
|
| 65 |
+
Environment[PythonCodeReviewAction, PythonCodeReviewObservation, PythonCodeReviewState]
|
| 66 |
+
):
|
| 67 |
+
"""Deterministic, bounded, evaluator-safe environment for code repair tasks."""
|
| 68 |
+
|
| 69 |
+
SUPPORTS_CONCURRENT_SESSIONS = True
|
| 70 |
+
|
| 71 |
+
def __init__(self, verbose: bool = False) -> None:
|
| 72 |
+
super().__init__()
|
| 73 |
+
self._verbose = bool(verbose)
|
| 74 |
+
self._task_order = self._safe_task_order()
|
| 75 |
+
self._task_cursor = -1
|
| 76 |
+
self._task: Optional[TaskSpec] = None
|
| 77 |
+
self._state = PythonCodeReviewState(episode_id=str(uuid4()))
|
| 78 |
+
self._done = False
|
| 79 |
+
self._last_status = "Call reset() to start."
|
| 80 |
+
self._last_reward = RewardDetails(value=0.0, reason="Environment initialized.")
|
| 81 |
+
self._metrics = self._blank_metrics()
|
| 82 |
+
self._last_action_type = ""
|
| 83 |
+
|
| 84 |
+
def reset(
|
| 85 |
+
self,
|
| 86 |
+
seed: Optional[int] = None,
|
| 87 |
+
episode_id: Optional[str] = None,
|
| 88 |
+
task_id: Optional[str] = None,
|
| 89 |
+
**_: object,
|
| 90 |
+
) -> PythonCodeReviewObservation:
|
| 91 |
+
"""Reset the environment for a deterministic task and return an observation."""
|
| 92 |
+
del seed
|
| 93 |
+
try:
|
| 94 |
+
self._reset_rubric()
|
| 95 |
+
except Exception:
|
| 96 |
+
pass
|
| 97 |
+
|
| 98 |
+
task = self._select_task(task_id)
|
| 99 |
+
self._task = task
|
| 100 |
+
self._done = False
|
| 101 |
+
self._metrics = self._blank_metrics()
|
| 102 |
+
self._last_action_type = ""
|
| 103 |
+
self._last_status = "Inspect the code, run checks, edit the code, then submit."
|
| 104 |
+
self._last_reward = RewardDetails(
|
| 105 |
+
value=0.0,
|
| 106 |
+
reason="Episode reset.",
|
| 107 |
+
prev_score=0.0,
|
| 108 |
+
curr_score=0.0,
|
| 109 |
+
)
|
| 110 |
+
self._state = PythonCodeReviewState(
|
| 111 |
+
episode_id=episode_id or str(uuid4()),
|
| 112 |
+
step_count=0,
|
| 113 |
+
task_id=task.task_id,
|
| 114 |
+
difficulty=task.difficulty,
|
| 115 |
+
task_kind=task.task_kind,
|
| 116 |
+
attempts_remaining=max(int(task.max_steps), 1),
|
| 117 |
+
current_code=task.starter_code,
|
| 118 |
+
errors="",
|
| 119 |
+
test_results="No checks run yet.",
|
| 120 |
+
history=[],
|
| 121 |
+
score=0.0,
|
| 122 |
+
done=False,
|
| 123 |
+
)
|
| 124 |
+
return self._build_observation()
|
| 125 |
+
|
| 126 |
+
def step(
|
| 127 |
+
self,
|
| 128 |
+
action: PythonCodeReviewAction,
|
| 129 |
+
timeout_s: Optional[float] = None,
|
| 130 |
+
**_: object,
|
| 131 |
+
) -> PythonCodeReviewObservation:
|
| 132 |
+
"""Execute one safe environment step and always return a valid observation."""
|
| 133 |
+
del timeout_s
|
| 134 |
+
try:
|
| 135 |
+
if self._task is None:
|
| 136 |
+
return self.reset()
|
| 137 |
+
|
| 138 |
+
if self._done:
|
| 139 |
+
self._last_status = "Episode already completed. Call reset() to continue."
|
| 140 |
+
self._last_reward = RewardDetails(
|
| 141 |
+
value=-INVALID_ACTION_PENALTY,
|
| 142 |
+
invalid_action_penalty=INVALID_ACTION_PENALTY,
|
| 143 |
+
reason="Episode already completed.",
|
| 144 |
+
prev_score=self._metrics["score"],
|
| 145 |
+
curr_score=self._metrics["score"],
|
| 146 |
+
code_changed=False,
|
| 147 |
+
)
|
| 148 |
+
return self._build_observation()
|
| 149 |
+
|
| 150 |
+
self._state.step_count += 1
|
| 151 |
+
action_type = _safe_text(getattr(action, "action_type", "analyze_code"), "analyze_code")
|
| 152 |
+
code = getattr(action, "code", None)
|
| 153 |
+
|
| 154 |
+
if action_type == "analyze_code":
|
| 155 |
+
self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=False)
|
| 156 |
+
elif action_type == "run_tests":
|
| 157 |
+
self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=False)
|
| 158 |
+
elif action_type == "edit_code":
|
| 159 |
+
self._handle_edit(code)
|
| 160 |
+
elif action_type == "submit_solution":
|
| 161 |
+
self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=True)
|
| 162 |
+
self._done = True
|
| 163 |
+
else:
|
| 164 |
+
self._apply_invalid_action(f"Unsupported action_type '{action_type}'.")
|
| 165 |
+
|
| 166 |
+
self._state.attempts_remaining = max(self._task.max_steps - self._state.step_count, 0)
|
| 167 |
+
if self._state.attempts_remaining == 0 and not self._done:
|
| 168 |
+
self._auto_submit()
|
| 169 |
+
|
| 170 |
+
self._state.done = self._done
|
| 171 |
+
return self._build_observation()
|
| 172 |
+
except Exception as exc:
|
| 173 |
+
self._apply_invalid_action(f"Step failure handled: {_safe_text(exc, 'unknown_error')}")
|
| 174 |
+
self._state.done = self._done
|
| 175 |
+
return self._build_observation()
|
| 176 |
+
|
| 177 |
+
@property
|
| 178 |
+
def state(self) -> PythonCodeReviewState:
|
| 179 |
+
"""Return a deep copy of the current environment state."""
|
| 180 |
+
try:
|
| 181 |
+
return self._state.model_copy(deep=True)
|
| 182 |
+
except Exception:
|
| 183 |
+
return PythonCodeReviewState(episode_id=str(uuid4()))
|
| 184 |
+
|
| 185 |
+
def list_task_summaries(self) -> list[object]:
|
| 186 |
+
"""Return public task summaries."""
|
| 187 |
+
try:
|
| 188 |
+
return list_task_summaries()
|
| 189 |
+
except Exception:
|
| 190 |
+
return []
|
| 191 |
+
|
| 192 |
+
def get_task(self, task_id: str) -> object:
|
| 193 |
+
"""Return a single public task descriptor."""
|
| 194 |
+
return self._select_task(task_id).to_descriptor()
|
| 195 |
+
|
| 196 |
+
def health(self) -> HealthResponse:
|
| 197 |
+
"""Return a simple health response."""
|
| 198 |
+
return HealthResponse(task_count=len(self._task_order))
|
| 199 |
+
|
| 200 |
+
def grade_task_submission(self, task_id: str, code: str) -> TaskGrade:
|
| 201 |
+
"""Grade a task submission outside an episode without raising."""
|
| 202 |
+
try:
|
| 203 |
+
task = self._select_task(task_id)
|
| 204 |
+
return self._safe_grade(task=task, candidate_code=code, include_hidden=True)
|
| 205 |
+
except Exception as exc:
|
| 206 |
+
return TaskGrade(score=0.0, details={"error": _safe_text(exc, "grading_failed")})
|
| 207 |
+
|
| 208 |
+
def run_tests(self, code: str, include_hidden: bool = False) -> tuple[float, dict[str, int], TaskGrade]:
|
| 209 |
+
"""Run deterministic grading and return score plus test summary."""
|
| 210 |
+
task = self._task or self._select_task(None)
|
| 211 |
+
grade = self._safe_grade(task=task, candidate_code=code, include_hidden=include_hidden)
|
| 212 |
+
return (
|
| 213 |
+
_clamp(grade.score),
|
| 214 |
+
{"passed": int(grade.tests_passed), "total": int(grade.tests_total)},
|
| 215 |
+
grade,
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
+
def apply_action(self, action: PythonCodeReviewAction) -> str:
|
| 219 |
+
"""Return the candidate code implied by the action."""
|
| 220 |
+
if getattr(action, "action_type", "") == "edit_code":
|
| 221 |
+
code = getattr(action, "code", None)
|
| 222 |
+
return str(code) if code is not None else self._state.current_code
|
| 223 |
+
return self._state.current_code
|
| 224 |
+
|
| 225 |
+
def compute_reward(
    self,
    action_type: str,
    previous_metrics: dict[str, float],
    current_metrics: dict[str, float],
    grade: TaskGrade,
    code_changed: bool,
    invalid_action: bool = False,
) -> RewardDetails:
    """Compute a bounded dynamic reward with progress and efficiency shaping."""
    # Deltas between the previous and current metric snapshots.
    prev_score = _clamp(previous_metrics.get("score", 0.0))
    curr_score = _clamp(current_metrics.get("score", 0.0))
    score_delta = curr_score - prev_score
    test_delta = current_metrics.get("test_fraction", 0.0) - previous_metrics.get("test_fraction", 0.0)
    syntax_delta = current_metrics.get("syntax_score", 0.0) - previous_metrics.get("syntax_score", 0.0)
    quality_delta = current_metrics.get("quality_score", 0.0) - previous_metrics.get("quality_score", 0.0)

    # Efficiency shaping: every step costs a little; analysis costs extra,
    # and repeating the previous action type costs more still.
    step_penalty = BASE_STEP_PENALTY + (ANALYZE_STEP_PENALTY if action_type == "analyze_code" else 0.0)
    repeated_penalty = REPEATED_ACTION_PENALTY if action_type == self._last_action_type else 0.0
    # "No progress" means every tracked metric is flat AND the code is unchanged.
    no_progress = (
        score_delta <= 1e-9
        and test_delta <= 1e-9
        and syntax_delta <= 1e-9
        and quality_delta <= 1e-9
        and not code_changed
    )
    # Invalid actions are penalized separately, not as stagnation.
    stagnation_penalty = NO_PROGRESS_PENALTY if no_progress and not invalid_action else 0.0
    regression_penalty = max(-score_delta, 0.0) * 0.6 + repeated_penalty + step_penalty
    invalid_penalty = INVALID_ACTION_PENALTY if invalid_action else 0.0
    timeout_penalty = TIMEOUT_PENALTY if bool(grade.timed_out) else 0.0

    # Positive shaping: only improvements earn reward (deltas clipped at 0).
    progress_reward = max(score_delta, 0.0) * 0.7
    syntax_reward = max(syntax_delta, 0.0) * 0.5
    test_reward = max(test_delta, 0.0) * 1.0
    quality_bonus = max(quality_delta, 0.0) * 0.2
    # Completion bonus only for an (essentially) perfect submitted solution.
    correctness_bonus = SUBMIT_COMPLETION_BONUS if action_type == "submit_solution" and curr_score >= 0.999 else 0.0

    reward_value = (
        progress_reward
        + syntax_reward
        + test_reward
        + quality_bonus
        + correctness_bonus
        - stagnation_penalty
        - regression_penalty
        - invalid_penalty
        - timeout_penalty
    )
    # Round first, then clamp the final reward into [-1, 1].
    reward_value = max(-1.0, min(1.0, round(reward_value, 6)))
    return RewardDetails(
        value=reward_value,
        syntax_reward=round(syntax_reward, 6),
        test_reward=round(test_reward, 6),
        quality_bonus=round(quality_bonus, 6),
        correctness_bonus=round(correctness_bonus, 6),
        progress_delta=round(progress_reward, 6),
        stagnation_penalty=round(stagnation_penalty, 6),
        regression_penalty=round(regression_penalty, 6),
        invalid_action_penalty=round(invalid_penalty, 6),
        timeout_penalty=round(timeout_penalty, 6),
        reason=f"{action_type} reward computed safely",
        prev_score=round(prev_score, 6),
        curr_score=round(curr_score, 6),
        code_changed=bool(code_changed),
    )
def _safe_task_order(self) -> list[str]:
|
| 292 |
+
"""Load deterministic task ids with a hard fallback."""
|
| 293 |
+
try:
|
| 294 |
+
loaded = list(task_ids())
|
| 295 |
+
if loaded:
|
| 296 |
+
return [str(task_id) for task_id in loaded]
|
| 297 |
+
except Exception:
|
| 298 |
+
pass
|
| 299 |
+
return ["syntax-fix-easy", "bug-fix-medium", "optimization-hard"]
|
| 300 |
+
|
| 301 |
+
def _blank_metrics(self) -> dict[str, float]:
|
| 302 |
+
"""Return an empty metric snapshot."""
|
| 303 |
+
return {
|
| 304 |
+
"score": 0.0,
|
| 305 |
+
"test_fraction": 0.0,
|
| 306 |
+
"syntax_score": 0.0,
|
| 307 |
+
"quality_score": 0.0,
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
def _select_task(self, task_id: Optional[str]) -> TaskSpec:
    """Select the requested task or advance deterministically.

    An explicitly requested task also re-syncs the rotation cursor when its
    id is part of the known rotation.  On any failure the rotation advances
    one slot; as a last resort the easy syntax task is loaded.
    """
    try:
        if task_id:
            task = load_task(task_id)
            if task.task_id in self._task_order:
                # Keep the rotation cursor aligned with the explicit choice.
                self._task_cursor = self._task_order.index(task.task_id)
            return task
    except Exception:
        pass

    try:
        # No (valid) explicit request: rotate to the next task in order.
        self._task_cursor = (self._task_cursor + 1) % len(self._task_order)
        return load_task(self._task_order[self._task_cursor])
    except Exception:
        # Hard fallback — presumably always loadable; TODO confirm.
        return load_task("syntax-fix-easy")
def _safe_grade(self, task: TaskSpec, candidate_code: str, include_hidden: bool) -> TaskGrade:
    """Grade *candidate_code*; a grader crash becomes a zero-score grade."""
    try:
        return grade_task(candidate_code, task, include_hidden=include_hidden)
    except Exception as exc:
        # Report at least one "test" so downstream fractions stay defined.
        fallback_total = max(len(task.visible_tests), 1)
        return TaskGrade(
            score=0.0,
            syntax_score=0.0,
            tests_passed=0,
            tests_total=fallback_total,
            details={"compile_error": "", "error": _safe_text(exc, "grading_failed")},
        )
def _metrics_from_grade(self, grade: TaskGrade) -> dict[str, float]:
    """Normalize a grading result into the reward-shaping metric snapshot."""
    total = max(int(grade.tests_total), 0)
    passed = max(int(grade.tests_passed), 0)
    # With no tests at all (e.g. pure syntax tasks) use the syntax score.
    if total:
        fraction = passed / total
    else:
        fraction = _clamp(grade.syntax_score)
    return {
        "score": _clamp(grade.score),
        "test_fraction": _clamp(fraction),
        "syntax_score": _clamp(grade.syntax_score),
        "quality_score": _clamp(grade.quality_score),
    }
def _format_test_results(self, grade: TaskGrade, include_hidden: bool) -> str:
    """Render a one-line test summary for the observation."""
    scope = "all checks" if include_hidden else "visible checks"
    compile_error = _safe_text(grade.details.get("compile_error", ""), "")
    if compile_error:
        return f"{scope}: compile error: {compile_error}"
    if grade.timed_out:
        return f"{scope}: execution timed out"
    # Syntax tasks carry no pytest cases; compiling IS the whole check.
    if self._task and self._task.task_kind == "syntax_fix":
        return "visible checks: code compiles successfully"
    return f"{scope}: {int(grade.tests_passed)}/{int(grade.tests_total)} passing"
def _build_status(self, action_type: str, grade: TaskGrade) -> str:
|
| 365 |
+
"""Build a human-readable status message."""
|
| 366 |
+
if action_type == "submit_solution":
|
| 367 |
+
return f"Solution submitted. Final score: {_clamp(grade.score):.3f}"
|
| 368 |
+
if action_type == "edit_code":
|
| 369 |
+
if grade.details.get("compile_error"):
|
| 370 |
+
return "Code updated, but syntax issues remain."
|
| 371 |
+
return "Code updated and evaluated."
|
| 372 |
+
if action_type == "run_tests":
|
| 373 |
+
return "Test run completed."
|
| 374 |
+
if action_type == "analyze_code":
|
| 375 |
+
return "Analysis completed."
|
| 376 |
+
return "Action handled safely."
|
| 377 |
+
|
| 378 |
+
def _apply_grade_to_state(self, grade: TaskGrade, include_hidden: bool) -> None:
    """Copy score, compile error and test summary into the episode state."""
    self._state.score = _clamp(grade.score)
    self._state.errors = _safe_text(grade.details.get("compile_error", ""), "")
    self._state.test_results = self._format_test_results(grade, include_hidden=include_hidden)
def _handle_scored_action(self, action_type: str, candidate_code: str, include_hidden: bool) -> None:
    """Grade code, update state, and compute reward for a valid action."""
    task = self._task or self._select_task(None)
    # Snapshot metrics BEFORE mutating state so reward deltas are meaningful.
    previous_metrics = dict(self._metrics)
    prior_code = self._state.current_code
    # Change detection ignores leading/trailing whitespace only.
    code_changed = candidate_code.strip() != prior_code.strip()
    if action_type == "edit_code":
        self._state.current_code = candidate_code
    grade = self._safe_grade(task=task, candidate_code=self._state.current_code, include_hidden=include_hidden)
    current_metrics = self._metrics_from_grade(grade)
    self._apply_grade_to_state(grade, include_hidden=include_hidden)
    self._last_reward = self.compute_reward(
        action_type=action_type,
        previous_metrics=previous_metrics,
        current_metrics=current_metrics,
        grade=grade,
        code_changed=code_changed,
        invalid_action=False,
    )
    self._last_status = self._build_status(action_type, grade)
    # Commit metrics and last-action only AFTER reward computation, which
    # reads self._last_action_type for the repeated-action penalty.
    self._metrics = current_metrics
    self._last_action_type = action_type
    self._append_history(action_type, self._last_status, self._last_reward.value)
def _handle_edit(self, code: Optional[str]) -> None:
    """Validate an ``edit_code`` payload and evaluate the new code."""
    cleaned = (code or "").strip()
    if cleaned:
        self._handle_scored_action(action_type="edit_code", candidate_code=cleaned, include_hidden=False)
    else:
        self._apply_invalid_action("edit_code requires code parameter.")
def _apply_invalid_action(self, reason: str) -> None:
    """Penalize an invalid action while keeping the episode alive."""
    snapshot = dict(self._metrics)
    # Re-use the current metrics so the reward sees exactly zero progress.
    placeholder_grade = TaskGrade(score=snapshot["score"], syntax_score=snapshot["syntax_score"])
    self._last_reward = self.compute_reward(
        action_type="invalid",
        previous_metrics=snapshot,
        current_metrics=snapshot,
        grade=placeholder_grade,
        code_changed=False,
        invalid_action=True,
    )
    self._last_status = reason
    self._append_history("analyze_code", reason, self._last_reward.value)
def _auto_submit(self) -> None:
    """Force-finalize the episode once the attempt budget is exhausted."""
    task = self._task or self._select_task(None)
    final_grade = self._safe_grade(task=task, candidate_code=self._state.current_code, include_hidden=True)
    self._apply_grade_to_state(final_grade, include_hidden=True)
    self._state.done = True
    self._done = True
    self._last_status = f"Auto-submitted. Final score: {_clamp(final_grade.score):.3f}"
def _append_history(self, action_type: str, status: str, reward: float) -> None:
|
| 442 |
+
"""Append one action record to the episode history."""
|
| 443 |
+
try:
|
| 444 |
+
stable_action = action_type if action_type in VALID_ACTIONS else "analyze_code"
|
| 445 |
+
self._state.history.append(
|
| 446 |
+
HistoryEntry(
|
| 447 |
+
step=max(int(self._state.step_count), 0),
|
| 448 |
+
action_type=stable_action,
|
| 449 |
+
status=_safe_text(status, "handled"),
|
| 450 |
+
reward=float(reward),
|
| 451 |
+
)
|
| 452 |
+
)
|
| 453 |
+
except Exception:
|
| 454 |
+
pass
|
| 455 |
+
|
| 456 |
+
def _build_observation(self) -> PythonCodeReviewObservation:
    """Build a valid observation from current state.

    Never raises: if the primary construction fails, a minimal fallback
    observation is returned carrying the error text in ``errors``.
    """
    task = self._task
    try:
        return PythonCodeReviewObservation(
            task_id=self._state.task_id or "",
            title=task.title if task else "",
            difficulty=self._state.difficulty or "easy",
            task_kind=self._state.task_kind,
            task_description=task.task_description if task else "",
            current_code=self._state.current_code,
            errors=self._state.errors,
            test_results=self._state.test_results,
            visible_tests=list(task.visible_tests) if task else [],
            history=list(self._state.history),
            attempts_remaining=max(int(self._state.attempts_remaining), 0),
            last_action_status=self._last_status,
            score=_clamp(self._state.score),
            reward_details=self._last_reward,
            reward=self._last_reward.value,
            done=bool(self._state.done),
            metadata={
                "prev_score": self._last_reward.prev_score,
                "curr_score": self._last_reward.curr_score,
            },
        )
    except Exception as exc:
        # Fallback path: use getattr defaults so a partially-broken state
        # still produces a structurally valid observation.
        return PythonCodeReviewObservation(
            task_id=self._state.task_id or "",
            title="",
            difficulty="easy",
            task_kind=None,
            task_description="",
            current_code=getattr(self._state, "current_code", ""),
            errors=_safe_text(exc, "observation_build_failed"),
            test_results="visible checks: unavailable",
            visible_tests=[],
            history=[],
            attempts_remaining=0,
            last_action_status="Observation fallback returned safely.",
            score=0.0,
            reward_details=RewardDetails(value=0.0, reason="Observation fallback."),
            reward=0.0,
            done=bool(getattr(self._state, "done", False)),
            metadata={},
        )
# Backwards-compatible aliases for older import paths.
PythonEnvironment = PythonCodeReviewEnvironment
CodeReviewEnvironment = PythonCodeReviewEnvironment
pytest-cache-files-1f62ra1g/container_sim/server/graders/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic graders for self-contained server builds."""
|
| 2 |
+
|
| 3 |
+
from .common import clamp_score
|
| 4 |
+
from .optimization import grade_optimization_task
|
| 5 |
+
from .pytest_runner import PytestExecution, run_pytest_suite
|
| 6 |
+
from .syntax import grade_bug_fix_task, grade_syntax_task, grade_task
|
| 7 |
+
|
| 8 |
+
__all__ = [
|
| 9 |
+
"PytestExecution",
|
| 10 |
+
"clamp_score",
|
| 11 |
+
"grade_bug_fix_task",
|
| 12 |
+
"grade_optimization_task",
|
| 13 |
+
"grade_syntax_task",
|
| 14 |
+
"grade_task",
|
| 15 |
+
"run_pytest_suite",
|
| 16 |
+
]
|
| 17 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/graders/common.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared deterministic scoring helpers."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import ast
|
| 6 |
+
import difflib
|
| 7 |
+
import traceback
|
| 8 |
+
from typing import Tuple
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def clamp_score(value: float) -> float:
    """Round *value* to 6 decimals, then clamp into the inclusive [0, 1]."""
    rounded = round(value, 6)
    return max(0.0, min(1.0, rounded))
def syntax_error_message(code: str) -> str:
    """Return a human-readable parse error for *code*, or "" if it parses."""
    try:
        ast.parse(code)
        return ""
    except SyntaxError as exc:
        return f"{exc.msg} (line {exc.lineno}, column {exc.offset})"
    except Exception:
        # Non-syntax parse failures (e.g. recursion limits): last frame only.
        return traceback.format_exc(limit=1).strip()
def compiles(code: str) -> bool:
    """Report whether *code* byte-compiles as a module."""
    try:
        compile(code, "<candidate>", "exec")
        return True
    except Exception:
        return False
def normalized_diff_score(code: str, reference_code: str) -> float:
    """Whitespace-insensitive similarity between *code* and the reference."""
    stripped_candidate = "".join(code.split())
    stripped_reference = "".join(reference_code.split())
    matcher = difflib.SequenceMatcher(a=stripped_candidate, b=stripped_reference)
    return clamp_score(matcher.ratio())
def style_score(code: str, max_line_length: int = 88) -> float:
    """Cheap deterministic style heuristic in [0, 1].

    Weights: 60% lines within the length limit, 20% no tabs anywhere,
    20% no trailing whitespace anywhere.
    """
    lines = code.splitlines() or [""]
    within_limit = sum(1 for line in lines if len(line) <= max_line_length)
    length_component = within_limit / len(lines)
    tabs_component = 0.0 if any("\t" in line for line in lines) else 1.0
    trailing_component = 0.0 if any(line != line.rstrip() for line in lines) else 1.0
    return clamp_score((length_component * 0.6) + (tabs_component * 0.2) + (trailing_component * 0.2))
def nested_loop_depth(tree: ast.AST) -> int:
    """Return the maximum loop-nesting depth anywhere in *tree*.

    Counts ``for``, ``async for`` and ``while`` nodes; any other node
    passes the current depth through to its children unchanged.
    """
    deepest = 0
    pending: list[tuple[ast.AST, int]] = [(tree, 0)]
    while pending:
        node, depth = pending.pop()
        if isinstance(node, (ast.For, ast.AsyncFor, ast.While)):
            depth += 1
            if depth > deepest:
                deepest = depth
        pending.extend((child, depth) for child in ast.iter_child_nodes(node))
    return deepest
def compile_tree(code: str) -> Tuple[ast.AST | None, str]:
|
| 65 |
+
try:
|
| 66 |
+
return ast.parse(code), ""
|
| 67 |
+
except SyntaxError as exc:
|
| 68 |
+
return None, f"{exc.msg} (line {exc.lineno}, column {exc.offset})"
|
| 69 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/graders/optimization.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic grading for optimization tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
import tempfile
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
from .common import clamp_score, compile_tree, nested_loop_depth, style_score
|
| 12 |
+
from .pytest_runner import run_pytest_suite
|
| 13 |
+
from ..models import TaskGrade
|
| 14 |
+
from ..tasks.task_bank import TaskSpec
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _benchmark_script(task: TaskSpec) -> str:
    """Render the in-sandbox benchmark driver for *task*.

    The script imports the entrypoint from ``candidate.py`` (the caller
    rewrites the import to ``starter`` for the baseline run), builds the
    benchmark events via the task-supplied builder, times ``benchmark_repeats``
    invocations, and writes the elapsed time plus row count to
    ``benchmark.json`` in the working directory.
    """
    return f"""import json
import time
from candidate import {task.benchmark_entrypoint}

{task.benchmark_builder}

events = build_benchmark_events()
start = time.perf_counter()
for _ in range({task.benchmark_repeats}):
    result = {task.benchmark_entrypoint}(events)
elapsed = time.perf_counter() - start
Path = __import__("pathlib").Path
Path("benchmark.json").write_text(json.dumps({{"elapsed": elapsed, "rows": len(result)}}), encoding="utf-8")
"""
def benchmark_runtime(candidate_code: str, task: TaskSpec) -> tuple[float, bool, str]:
|
| 35 |
+
assert task.benchmark_entrypoint is not None
|
| 36 |
+
try:
|
| 37 |
+
with tempfile.TemporaryDirectory(prefix="python-code-review-bench-") as temp_dir:
|
| 38 |
+
temp_path = Path(temp_dir)
|
| 39 |
+
(temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
|
| 40 |
+
(temp_path / "starter.py").write_text(task.starter_code, encoding="utf-8")
|
| 41 |
+
(temp_path / "candidate_runner.py").write_text(_benchmark_script(task), encoding="utf-8")
|
| 42 |
+
starter_script = _benchmark_script(task).replace("from candidate import", "from starter import")
|
| 43 |
+
(temp_path / "starter_runner.py").write_text(starter_script, encoding="utf-8")
|
| 44 |
+
|
| 45 |
+
try:
|
| 46 |
+
starter_run = subprocess.run(
|
| 47 |
+
[sys.executable, "starter_runner.py"],
|
| 48 |
+
cwd=temp_path,
|
| 49 |
+
capture_output=True,
|
| 50 |
+
text=True,
|
| 51 |
+
timeout=task.benchmark_timeout_s,
|
| 52 |
+
check=False,
|
| 53 |
+
)
|
| 54 |
+
starter_payload = json.loads((temp_path / "benchmark.json").read_text(encoding="utf-8"))
|
| 55 |
+
candidate_run = subprocess.run(
|
| 56 |
+
[sys.executable, "candidate_runner.py"],
|
| 57 |
+
cwd=temp_path,
|
| 58 |
+
capture_output=True,
|
| 59 |
+
text=True,
|
| 60 |
+
timeout=task.benchmark_timeout_s,
|
| 61 |
+
check=False,
|
| 62 |
+
)
|
| 63 |
+
candidate_payload = json.loads((temp_path / "benchmark.json").read_text(encoding="utf-8"))
|
| 64 |
+
except subprocess.TimeoutExpired as exc:
|
| 65 |
+
output = (exc.stdout or "") + (exc.stderr or "")
|
| 66 |
+
return 0.0, True, (output or "benchmark timed out").strip()
|
| 67 |
+
except Exception as exc:
|
| 68 |
+
return 0.0, False, str(exc)
|
| 69 |
+
|
| 70 |
+
starter_elapsed = max(float(starter_payload["elapsed"]), 1e-9)
|
| 71 |
+
candidate_elapsed = max(float(candidate_payload["elapsed"]), 1e-9)
|
| 72 |
+
speedup = starter_elapsed / candidate_elapsed
|
| 73 |
+
runtime_score = clamp_score(min((speedup - 1.0) / 3.0, 1.0))
|
| 74 |
+
output = "\n".join(
|
| 75 |
+
part
|
| 76 |
+
for part in [
|
| 77 |
+
starter_run.stdout.strip(),
|
| 78 |
+
starter_run.stderr.strip(),
|
| 79 |
+
candidate_run.stdout.strip(),
|
| 80 |
+
candidate_run.stderr.strip(),
|
| 81 |
+
f"starter={starter_elapsed:.6f}s candidate={candidate_elapsed:.6f}s speedup={speedup:.2f}x",
|
| 82 |
+
]
|
| 83 |
+
if part
|
| 84 |
+
)
|
| 85 |
+
return runtime_score, False, output
|
| 86 |
+
except Exception as exc:
|
| 87 |
+
return 0.0, False, str(exc)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def ast_quality_score(code: str, task: TaskSpec) -> float:
    """Score code quality from AST structure and expected quality markers.

    0.2 for a docstring on the first top-level function, 0.4 for loop
    nesting depth <= 1, plus 0.2 per expected marker present in the code;
    clamped into [0, 1].  Unparsable code scores 0.
    """
    tree, _ = compile_tree(code)
    if tree is None:
        return 0.0
    import ast

    first_function = next(
        (node for node in tree.body if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))),
        None,
    )
    points = 0.0
    if first_function is not None and ast.get_docstring(first_function, clean=False):
        points += 0.2
    if nested_loop_depth(tree) <= 1:
        points += 0.4
    for marker in task.expected_quality_markers:
        if marker in code:
            points += 0.2
    return clamp_score(points)
def grade_optimization_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Grade an optimization task: tests, benchmark speedup, quality, style.

    Weights: 50% test pass fraction, 30% benchmark runtime score,
    15% AST quality, 5% style.  A timeout in either the test run or the
    benchmark short-circuits to a zero score with ``timed_out=True``.
    """
    # Correctness first: run visible and hidden tests together.
    execution = run_pytest_suite(candidate_code, [*task.visible_tests, *task.hidden_tests], timeout_s=task.benchmark_timeout_s)
    test_fraction = execution.passed / execution.total if execution.total else 0.0

    if execution.timed_out:
        return TaskGrade(score=0.0, tests_passed=execution.passed, tests_total=execution.total, timed_out=True, details={"tests": execution.output})

    runtime_score, timed_out, benchmark_output = benchmark_runtime(candidate_code, task)
    if timed_out:
        return TaskGrade(score=0.0, tests_passed=execution.passed, tests_total=execution.total, timed_out=True, details={"tests": execution.output, "benchmark": benchmark_output})

    quality_score = ast_quality_score(candidate_code, task)
    pep8_score = style_score(candidate_code, task.style_max_line_length)
    # Weighted blend, clamped into [0, 1].
    score = clamp_score((0.5 * test_fraction) + (0.3 * runtime_score) + (0.15 * quality_score) + (0.05 * pep8_score))
    return TaskGrade(
        score=score,
        syntax_score=1.0,
        tests_passed=execution.passed,
        tests_total=execution.total,
        quality_score=quality_score,
        runtime_score=runtime_score,
        details={
            "tests": execution.output,
            "benchmark": benchmark_output,
            "test_fraction": round(test_fraction, 4),
            "runtime_score": round(runtime_score, 4),
            "style_score": round(pep8_score, 4),
        },
    )
pytest-cache-files-1f62ra1g/container_sim/server/graders/pytest_runner.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Helpers for deterministic pytest execution in temp sandboxes."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
import tempfile
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Iterable
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass(frozen=True)
class PytestExecution:
    """Immutable summary of a single sandboxed pytest run."""

    passed: int      # tests that passed in the "call" phase
    failed: int      # tests that failed in the "call" phase
    total: int       # best-effort total (never less than the supplied cases)
    timed_out: bool  # True when the subprocess hit the wall-clock timeout
    output: str      # combined stdout/stderr (or an error message)
def _test_module_source(tests: Iterable[str]) -> str:
|
| 24 |
+
blocks: list[str] = ["from candidate import * # noqa: F401,F403"]
|
| 25 |
+
for index, test in enumerate(tests, start=1):
|
| 26 |
+
snippet = str(test).strip()
|
| 27 |
+
if not snippet:
|
| 28 |
+
continue
|
| 29 |
+
if snippet.startswith("def test_"):
|
| 30 |
+
blocks.append(snippet)
|
| 31 |
+
continue
|
| 32 |
+
blocks.append(
|
| 33 |
+
"\n".join(
|
| 34 |
+
[
|
| 35 |
+
f"def test_case_{index:03d}():",
|
| 36 |
+
f" assert {snippet}",
|
| 37 |
+
]
|
| 38 |
+
)
|
| 39 |
+
)
|
| 40 |
+
return "\n\n".join(blocks) or "def test_placeholder():\n assert True\n"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _runner_script() -> str:
    """Return the in-sandbox driver that runs pytest and dumps JSON results.

    The script is written next to ``test_candidate.py`` and executed with a
    fresh interpreter.  It counts pass/fail via a plugin hook (call phase
    only) and writes ``pytest_results.json`` so the parent process can read
    structured counts instead of parsing pytest's text output.
    """
    return """import json
import pathlib
import pytest


class Collector:
    def __init__(self) -> None:
        self.passed = 0
        self.failed = 0

    def pytest_runtest_logreport(self, report):
        if report.when != "call":
            return
        if report.passed:
            self.passed += 1
        elif report.failed:
            self.failed += 1


collector = Collector()
exit_code = pytest.main(["-q", "test_candidate.py"], plugins=[collector])
payload = {
    "passed": collector.passed,
    "failed": collector.failed,
    "exit_code": int(exit_code),
}
pathlib.Path("pytest_results.json").write_text(json.dumps(payload), encoding="utf-8")
"""
def run_pytest_suite(candidate_code: str, tests: Iterable[str], timeout_s: float = 3.0) -> PytestExecution:
    """Execute *tests* against *candidate_code* with pytest in a sandbox dir.

    Never raises: timeouts, missing result files and unexpected errors all
    collapse into a ``PytestExecution`` describing the failure.
    """
    suite = list(tests)
    fallback_total = max(len(suite), 1)
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-") as temp_dir:
            sandbox = Path(temp_dir)
            (sandbox / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (sandbox / "test_candidate.py").write_text(_test_module_source(suite), encoding="utf-8")
            (sandbox / "runner.py").write_text(_runner_script(), encoding="utf-8")

            try:
                completed = subprocess.run(
                    [sys.executable, "runner.py"],
                    cwd=sandbox,
                    capture_output=True,
                    text=True,
                    timeout=timeout_s,
                    check=False,
                )
            except subprocess.TimeoutExpired as exc:
                captured = (exc.stdout or "") + (exc.stderr or "")
                return PytestExecution(
                    passed=0,
                    failed=fallback_total,
                    total=fallback_total,
                    timed_out=True,
                    output=(captured or "pytest timed out").strip(),
                )

            combined = ((completed.stdout or "") + (completed.stderr or "")).strip()
            results_file = sandbox / "pytest_results.json"
            if not results_file.exists():
                # Runner crashed before writing results: count everything failed.
                return PytestExecution(0, fallback_total, fallback_total, False, combined)

            try:
                payload = json.loads(results_file.read_text(encoding="utf-8"))
            except Exception as exc:
                return PytestExecution(0, fallback_total, fallback_total, False, (combined or str(exc)).strip())

            passed = int(payload.get("passed", 0))
            failed = int(payload.get("failed", 0))
            # Total is best-effort: never report fewer slots than supplied cases.
            return PytestExecution(passed, failed, max(passed + failed, len(suite)), False, combined)
    except Exception as exc:
        return PytestExecution(0, fallback_total, fallback_total, False, str(exc))
pytest-cache-files-1f62ra1g/container_sim/server/graders/syntax.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task graders for syntax and bug-fix tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from .common import clamp_score, compiles, normalized_diff_score, style_score, syntax_error_message
|
| 6 |
+
from .optimization import grade_optimization_task
|
| 7 |
+
from .pytest_runner import run_pytest_suite
|
| 8 |
+
from ..models import TaskGrade
|
| 9 |
+
from ..tasks.task_bank import TaskSpec
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def grade_syntax_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Grade a syntax-repair task: compiling earns full credit.

    Non-compiling code gets partial credit proportional to its similarity
    with the reference solution.
    """
    error = syntax_error_message(candidate_code)
    quality_base = style_score(candidate_code, task.style_max_line_length)
    if not error:
        return TaskGrade(score=1.0, syntax_score=1.0, quality_score=quality_base, details={"compile_error": ""})
    similarity = normalized_diff_score(candidate_code, task.reference_code)
    partial_credit = clamp_score(0.15 + (0.55 * similarity))
    return TaskGrade(
        score=partial_credit,
        syntax_score=0.0,
        quality_score=similarity * quality_base,
        details={"compile_error": error},
    )
def grade_bug_fix_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
    """Grade a bug-fix submission by running its pytest suite.

    Non-compiling code scores zero. Otherwise the score is the fraction of
    passed tests (visible plus, optionally, hidden), with a style-based
    quality term. A timed-out suite yields no correctness credit.
    """
    if not compiles(candidate_code):
        return TaskGrade(
            score=0.0,
            syntax_score=0.0,
            details={"compile_error": syntax_error_message(candidate_code)},
        )

    suite = list(task.visible_tests)
    if include_hidden:
        suite.extend(task.hidden_tests)

    run = run_pytest_suite(candidate_code, suite, timeout_s=3.0)
    if run.timed_out:
        # The code compiled, so syntax credit is kept even on a hang.
        return TaskGrade(
            score=0.0,
            syntax_score=1.0,
            tests_passed=run.passed,
            tests_total=run.total,
            timed_out=True,
            details={"compile_error": "", "tests": run.output},
        )

    fraction = run.passed / run.total if run.total else 0.0
    return TaskGrade(
        score=clamp_score(fraction),
        syntax_score=1.0,
        tests_passed=run.passed,
        tests_total=run.total,
        quality_score=style_score(candidate_code, task.style_max_line_length),
        details={"compile_error": "", "tests": run.output},
    )
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def grade_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
    """Dispatch grading to the handler matching the task's kind."""
    kind = task.task_kind
    if kind == "bug_fix":
        return grade_bug_fix_task(candidate_code, task, include_hidden=include_hidden)
    if kind == "syntax_fix":
        return grade_syntax_task(candidate_code, task)
    # Any other kind falls through to the optimization grader.
    return grade_optimization_task(candidate_code, task)
|
| 60 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/grading.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic grading helpers for PR-review tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from typing import Iterable, List, Optional, Sequence, Set
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
from models import ReviewFinding, TaskGrade
|
| 11 |
+
from server.task_bank import RubricIssue, TaskSpec
|
| 12 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 13 |
+
from ..models import ReviewFinding, TaskGrade
|
| 14 |
+
from .task_bank import RubricIssue, TaskSpec
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
FALSE_POSITIVE_PENALTY = 0.10
|
| 18 |
+
DUPLICATE_PENALTY = 0.05
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass(frozen=True)
class FindingMatch:
    """Result of matching one finding against the rubric."""

    # Identifier of the matched rubric issue; None when the finding matched
    # nothing (a false positive) or was detected as a duplicate.
    issue_id: Optional[str]
    # True when the finding's fingerprint was already seen this episode.
    duplicate: bool = False
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def finding_fingerprint(finding: ReviewFinding) -> str:
    """Build a deterministic fingerprint for duplicate detection.

    Tokenizes the finding's location and free-text fields, then joins the
    sorted tokens so trivially reordered findings collapse to one value.
    """
    parts = (
        finding.file_path,
        str(finding.line or 0),
        finding.category,
        finding.severity,
        finding.title,
        finding.explanation,
        finding.suggested_fix,
    )
    return "|".join(sorted(tokens(" ".join(parts))))
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def match_finding(
    finding: ReviewFinding,
    task: TaskSpec,
    matched_issue_ids: Set[str],
    seen_fingerprints: Set[str],
) -> FindingMatch:
    """Match one finding against the remaining rubric issues.

    Exact duplicates (by fingerprint) short-circuit before any rubric
    comparison, and rubric issues already claimed this episode are never
    matched a second time.
    """
    if finding_fingerprint(finding) in seen_fingerprints:
        return FindingMatch(issue_id=None, duplicate=True)

    unclaimed = (
        issue for issue in task.rubric_issues if issue.issue_id not in matched_issue_ids
    )
    for issue in unclaimed:
        if finding_matches_issue(finding, issue):
            return FindingMatch(issue_id=issue.issue_id)
    return FindingMatch(issue_id=None)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def finding_matches_issue(finding: ReviewFinding, issue: RubricIssue) -> bool:
    """Return True when a finding deterministically matches a rubric issue.

    A match requires the same file, category, and severity, a line within
    two lines of the rubric location, and enough keyword hits in the
    finding's free text.
    """
    location_matches = (
        finding.file_path == issue.file_path
        and finding.category == issue.category
        and finding.severity == issue.severity
        and finding.line is not None
        and abs(finding.line - issue.line) <= 2
    )
    if not location_matches:
        return False

    words = tokens(" ".join([finding.title, finding.explanation, finding.suggested_fix]))
    hits = sum(1 for keyword in issue.keywords if keyword in words)
    return hits >= issue.min_keyword_hits
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def score_task(
    task: TaskSpec,
    matched_issue_ids: Iterable[str],
    false_positives: int = 0,
    duplicate_findings: int = 0,
) -> TaskGrade:
    """Score a task from cumulative episode state.

    The score is the summed weight of matched rubric issues, minus fixed
    penalties for false positives and duplicates, clamped into [0, 1].
    """
    matched = set(matched_issue_ids)
    weight = sum(
        issue.weight for issue in task.rubric_issues if issue.issue_id in matched
    )
    penalized = (
        weight
        - false_positives * FALSE_POSITIVE_PENALTY
        - duplicate_findings * DUPLICATE_PENALTY
    )
    # Round away float noise before clamping into the valid score range.
    final = max(0.0, min(1.0, round(penalized, 6)))
    return TaskGrade(
        score=final,
        matched_issue_ids=sorted(matched),
        false_positives=false_positives,
        duplicate_findings=duplicate_findings,
        matched_weight=min(1.0, round(weight, 6)),
    )
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def grade_findings(task: TaskSpec, findings: Sequence[ReviewFinding]) -> TaskGrade:
    """Offline-grade a batch of findings for one task.

    Walks the findings in order, tracking matched rubric issues, duplicate
    fingerprints, and false positives, then delegates to ``score_task``.
    """
    matched: Set[str] = set()
    fingerprints: Set[str] = set()
    false_positive_count = 0
    duplicate_count = 0

    for finding in findings:
        outcome = match_finding(
            finding=finding,
            task=task,
            matched_issue_ids=matched,
            seen_fingerprints=fingerprints,
        )
        if outcome.duplicate:
            duplicate_count += 1
            continue
        # Only non-duplicate findings contribute a fingerprint, so repeats
        # of a false positive are counted as duplicates, not new FPs.
        fingerprints.add(finding_fingerprint(finding))
        if outcome.issue_id is None:
            false_positive_count += 1
        else:
            matched.add(outcome.issue_id)

    return score_task(
        task=task,
        matched_issue_ids=matched,
        false_positives=false_positive_count,
        duplicate_findings=duplicate_count,
    )
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def tokens(text: str) -> Set[str]:
    """Normalize free text into deterministic comparison tokens.

    Lowercases the input and extracts runs of ASCII lowercase letters,
    digits, and underscores, returned as a set so comparisons ignore
    ordering and repetition.
    """
    lowered = text.lower()
    return {match.group(0) for match in re.finditer(r"[a-z0-9_]+", lowered)}
|
| 147 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/models.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typed models for the self-contained server package."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Dict, List, Literal, Optional
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
from .compat import Action, Observation, State
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Shared literal vocabularies used by actions, observations, and grading.
Difficulty = Literal["easy", "medium", "hard"]
TaskKind = Literal["syntax_fix", "bug_fix", "optimization"]
ActionType = Literal["analyze_code", "edit_code", "run_tests", "submit_solution"]
Category = Literal["bug", "security", "performance", "maintainability", "style", "testing"]
Severity = Literal["critical", "warning", "info"]


class HistoryEntry(BaseModel):
    """One step of episode history shown back to the agent."""

    step: int = Field(..., ge=0)
    action_type: ActionType
    status: str
    reward: float


class RewardDetails(BaseModel):
    """Breakdown of one step's reward into its component terms."""

    # Total reward value for the step; the remaining fields are components.
    value: float
    syntax_reward: float = 0.0
    test_reward: float = 0.0
    quality_bonus: float = 0.0
    correctness_bonus: float = 0.0
    progress_delta: float = 0.0
    stagnation_penalty: float = 0.0
    regression_penalty: float = 0.0
    invalid_action_penalty: float = 0.0
    timeout_penalty: float = 0.0
    # Human-readable explanation of how the reward was computed.
    reason: str
    prev_score: float = 0.0
    curr_score: float = 0.0
    code_changed: bool = False


class PythonCodeReviewAction(Action):
    """Agent action: one of the ActionType verbs plus optional code payload."""

    action_type: ActionType
    # New code for edit_code/submit_solution; unused by other action types.
    code: Optional[str] = None


class PythonCodeReviewObservation(Observation):
    """Observation returned to the agent after reset or step."""

    task_id: str
    title: str = ""
    difficulty: Difficulty
    task_kind: Optional[TaskKind] = None
    task_description: str
    current_code: str
    errors: str
    test_results: str
    visible_tests: List[str] = Field(default_factory=list)
    history: List[HistoryEntry] = Field(default_factory=list)
    attempts_remaining: int = Field(..., ge=0)
    last_action_status: str = ""
    score: float = Field(..., ge=0.0, le=1.0)
    # Defaults to a zero-valued "Reset" reward on episode start.
    reward_details: RewardDetails = Field(
        default_factory=lambda: RewardDetails(value=0.0, reason="Reset")
    )


class PythonCodeReviewState(State):
    """Server-side episode state; mirrors the observation plus bookkeeping."""

    episode_id: str
    step_count: int = Field(default=0, ge=0)
    task_id: Optional[str] = None
    difficulty: Optional[Difficulty] = None
    task_kind: Optional[TaskKind] = None
    attempts_remaining: int = Field(default=0, ge=0)
    current_code: str = ""
    errors: str = ""
    test_results: str = ""
    history: List[HistoryEntry] = Field(default_factory=list)
    score: float = Field(default=0.0, ge=0.0, le=1.0)
    done: bool = False


class TaskDescriptor(BaseModel):
    """Public task view exposed to clients (no hidden rubric data)."""

    task_id: str
    title: str
    difficulty: Difficulty
    task_kind: Optional[TaskKind] = None
    task_description: str = ""
    starter_code: str = ""
    visible_tests: List[str] = Field(default_factory=list)
    goal: str = ""
    repo_summary: str = ""
    changed_files: List[str] = Field(default_factory=list)
    available_files: List[str] = Field(default_factory=list)
    max_steps: int = Field(..., ge=1)


class TaskSummary(BaseModel):
    """Compact listing entry for task-catalog endpoints."""

    task_id: str
    difficulty: Difficulty
    title: str
    goal: str = ""


class ReviewFinding(BaseModel):
    """A single reviewer finding with location and free-text fields."""

    title: str
    file_path: str = ""
    line: Optional[int] = Field(default=None, ge=1)
    category: Category = "bug"
    severity: Severity = "warning"
    rationale: str = ""
    recommendation: str = ""
    rule_id: str = ""

    # Aliases kept for callers (e.g. the grader) that use the older
    # explanation/suggested_fix field names.
    @property
    def explanation(self) -> str:
        return self.rationale

    @property
    def suggested_fix(self) -> str:
        return self.recommendation


class DirectReviewResponse(BaseModel):
    """Response body for the direct (non-episode) review endpoint."""

    issues: List[ReviewFinding] = Field(default_factory=list)
    summary: str = ""
    score: float = Field(default=0.0, ge=0.0, le=1.0)
    improved_code: Optional[str] = None


class TaskGrade(BaseModel):
    """Unified grading result; different task kinds fill different fields."""

    score: float = Field(..., ge=0.0, le=1.0)
    syntax_score: float = Field(default=0.0, ge=0.0, le=1.0)
    tests_passed: int = Field(default=0, ge=0)
    tests_total: int = Field(default=0, ge=0)
    quality_score: float = Field(default=0.0, ge=0.0, le=1.0)
    runtime_score: float = Field(default=0.0, ge=0.0, le=1.0)
    timed_out: bool = False
    # Fields below are used by rubric-based PR-review grading.
    matched_issue_ids: List[str] = Field(default_factory=list)
    false_positives: int = Field(default=0, ge=0)
    duplicate_findings: int = Field(default=0, ge=0)
    matched_weight: float = Field(default=0.0, ge=0.0, le=1.0)
    details: Dict[str, Any] = Field(default_factory=dict)


class HealthResponse(BaseModel):
    """Response body for the health-check endpoint."""

    status: Literal["ok"] = "ok"
    environment: str = "python_code_review_env"
    task_count: int = Field(default=0, ge=0)
|
| 149 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/python_env_environment.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility shim for older imports."""
|
| 2 |
+
|
| 3 |
+
try:
    # Absolute import path used when the server directory itself is on
    # sys.path (e.g. when modules are loaded as top-level scripts).
    from server.code_review_environment import PythonEnvironment
except ModuleNotFoundError:  # pragma: no cover
    # Package-relative fallback for when this file is imported as part of
    # the server package.
    from .code_review_environment import PythonEnvironment


__all__ = ["PythonEnvironment"]
|
pytest-cache-files-1f62ra1g/container_sim/server/requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.2
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn[standard]>=0.30.0
|
| 4 |
+
openai>=1.40.0
|
| 5 |
+
pytest>=8.0.0
|
| 6 |
+
pydantic>=2.0.0
|
pytest-cache-files-1f62ra1g/container_sim/server/static_review.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic static-review helpers for arbitrary Python code.
|
| 2 |
+
|
| 3 |
+
Unlike the benchmark grader, this module does not compare against hidden rubric
|
| 4 |
+
items. Instead, it performs direct AST-based review on arbitrary snippets so it
|
| 5 |
+
can be used for manual testing, examples, and future dataset generation.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import ast
|
| 11 |
+
from typing import List, Optional
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
from models import DirectReviewResponse, ReviewFinding
|
| 15 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 16 |
+
from ..models import DirectReviewResponse, ReviewFinding
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class _StaticAnalyzer(ast.NodeVisitor):
    """AST visitor that emits structured review findings.

    The visitor intentionally focuses on a small set of high-signal patterns so
    the direct-review endpoint stays predictable and easy to understand.
    """

    def __init__(self) -> None:
        # Findings are accumulated in visit order; deduplication happens later.
        self.issues: List[ReviewFinding] = []

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:  # noqa: N802
        """Flag mutable default arguments in function definitions."""

        # Fix: also inspect keyword-only defaults. `kw_defaults` may contain
        # None placeholders for keyword-only args without a default value.
        defaults = list(node.args.defaults) + [
            default for default in node.args.kw_defaults if default is not None
        ]
        for default in defaults:
            if isinstance(default, (ast.List, ast.Dict, ast.Set)):
                self.issues.append(
                    ReviewFinding(
                        title="Mutable default argument",
                        line=getattr(default, "lineno", node.lineno),
                        category="bug",
                        severity="warning",
                        rationale=(
                            "Mutable defaults persist across calls and can leak state "
                            "between unrelated requests."
                        ),
                        recommendation="Use None as the default and create the object inside the function.",
                        rule_id="mutable-default-list",
                    )
                )
        self.generic_visit(node)

    # Fix: async functions share the same mutable-default pitfall; previously
    # `async def` bodies were recursed into but their defaults were never
    # checked because ast.AsyncFunctionDef has its own visitor hook.
    visit_AsyncFunctionDef = visit_FunctionDef

    def visit_Call(self, node: ast.Call) -> None:  # noqa: N802
        """Inspect function calls for obviously unsafe or noisy patterns."""

        func_name = self._call_name(node)
        if func_name in {"eval", "exec"}:
            self.issues.append(
                ReviewFinding(
                    title=f"Avoid {func_name} on untrusted input",
                    line=node.lineno,
                    category="security",
                    severity="critical",
                    rationale=(
                        f"{func_name} executes arbitrary code and is unsafe on "
                        "user-controlled input."
                    ),
                    recommendation="Use a safe parser or a whitelist-based evaluator.",
                    rule_id="avoid-eval" if func_name == "eval" else "avoid-exec",
                )
            )
        if func_name.endswith("check_output") or func_name.endswith("run"):
            for keyword in node.keywords:
                # `shell=True` is only a problem when the command comes from a
                # shell-parsed string, but this heuristic is high value for
                # review and intentionally conservative.
                if keyword.arg == "shell" and isinstance(keyword.value, ast.Constant) and keyword.value.value is True:
                    self.issues.append(
                        ReviewFinding(
                            title="shell=True with dynamic input",
                            line=node.lineno,
                            category="security",
                            severity="critical",
                            rationale=(
                                "shell=True executes through the shell and can allow "
                                "command injection when the command string is interpolated."
                            ),
                            recommendation="Pass a list of arguments and keep shell=False.",
                            rule_id="shell-true-command-injection",
                        )
                    )
        if func_name == "print":
            self.issues.append(
                ReviewFinding(
                    title="Print statement in application logic",
                    line=node.lineno,
                    category="style",
                    severity="info",
                    rationale="Production services should prefer structured logging over print statements.",
                    recommendation="Use the logging module or return the value to the caller.",
                    rule_id="print-statement",
                )
            )
        self.generic_visit(node)

    def visit_ExceptHandler(self, node: ast.ExceptHandler) -> None:  # noqa: N802
        """Flag bare exception handlers that hide failures."""

        # `except:` with no exception type has node.type is None.
        if node.type is None:
            self.issues.append(
                ReviewFinding(
                    title="Bare except",
                    line=node.lineno,
                    category="maintainability",
                    severity="warning",
                    rationale="Bare except catches KeyboardInterrupt and other system-level exceptions.",
                    recommendation="Catch a specific exception and record the failure.",
                    rule_id="bare-except",
                )
            )
        self.generic_visit(node)

    def visit_For(self, node: ast.For) -> None:  # noqa: N802
        """Look for list-membership checks nested in loops."""

        # At most one finding per loop; `break` stops after the first hit.
        for child in ast.walk(node):
            if isinstance(child, ast.Compare) and any(
                isinstance(operator, (ast.In, ast.NotIn)) for operator in child.ops
            ):
                if isinstance(child.comparators[0], ast.Name):
                    self.issues.append(
                        ReviewFinding(
                            title="Potential quadratic membership check inside loop",
                            line=child.lineno,
                            category="performance",
                            severity="warning",
                            rationale=(
                                "Repeated membership checks against a list inside a loop "
                                "can degrade to quadratic runtime."
                            ),
                            recommendation="Use a set or dict for O(1) membership checks.",
                            rule_id="quadratic-membership-check",
                        )
                    )
                    break
        self.generic_visit(node)

    @staticmethod
    def _call_name(node: ast.Call) -> str:
        """Extract a dotted function name such as `subprocess.run`."""

        func = node.func
        if isinstance(func, ast.Name):
            return func.id
        if isinstance(func, ast.Attribute):
            prefix = _StaticAnalyzer._attribute_prefix(func.value)
            return f"{prefix}.{func.attr}" if prefix else func.attr
        # Calls on arbitrary expressions (lambdas, subscripts) have no name.
        return ""

    @staticmethod
    def _attribute_prefix(node: ast.AST) -> str:
        """Reconstruct the left-hand side of an attribute chain."""

        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            prefix = _StaticAnalyzer._attribute_prefix(node.value)
            return f"{prefix}.{node.attr}" if prefix else node.attr
        return ""
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def analyze_python_code(code: str) -> List[ReviewFinding]:
    """Analyze arbitrary Python code and return structured findings.

    Empty input and syntax errors are reported as findings so callers
    always receive a valid response shape rather than an exception.
    """
    if not code.strip():
        empty_submission = ReviewFinding(
            title="No code provided",
            category="bug",
            severity="warning",
            rationale="The reviewer cannot inspect an empty submission.",
            recommendation="Provide Python source code.",
            rule_id="empty-input",
        )
        return [empty_submission]

    try:
        tree = ast.parse(code)
    except SyntaxError as exc:
        # Turn the parse failure into a single critical finding.
        broken = ReviewFinding(
            title="Syntax error",
            line=exc.lineno,
            category="bug",
            severity="critical",
            rationale=exc.msg,
            recommendation="Fix the syntax error before running static review.",
            rule_id="syntax-error",
        )
        return [broken]

    visitor = _StaticAnalyzer()
    visitor.visit(tree)
    return _deduplicate(visitor.issues)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def build_direct_review_response(
    code: str, context: Optional[str] = None
) -> DirectReviewResponse:
    """Build the public direct-review response for the `/review` route.

    The score is intentionally simple: each finding subtracts a fixed
    penalty by severity (critical 0.3, warning 0.15, info 0.05), and the
    result is clamped into [0, 1].
    """
    issues = analyze_python_code(code)
    penalty_by_severity = {"critical": 0.3, "warning": 0.15}
    total_penalty = 0.0
    for issue in issues:
        total_penalty += penalty_by_severity.get(issue.severity, 0.05)

    return DirectReviewResponse(
        issues=issues,
        summary=_build_summary(issues, context),
        score=max(0.0, min(1.0, 1.0 - total_penalty)),
        improved_code=_suggest_improved_code(code, issues),
    )
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def _build_summary(issues: List[ReviewFinding], context: Optional[str]) -> str:
    """Create a concise human-readable summary for the direct-review response."""

    if issues:
        counts = {"critical": 0, "warning": 0, "info": 0}
        for issue in issues:
            counts[issue.severity] = counts.get(issue.severity, 0) + 1
        base = (
            f"Detected {len(issues)} issue(s): {counts['critical']} critical, "
            f"{counts['warning']} warning, {counts['info']} info."
        )
    else:
        base = "No obvious issues were detected by the deterministic reviewer."
    # Caller-supplied context is appended verbatim when present.
    return f"{base} Context: {context}" if context else base
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def _suggest_improved_code(code: str, issues: List[ReviewFinding]) -> Optional[str]:
    """Append high-level fix directions to the submitted code.

    Returns None when there are no issues; otherwise returns the original
    code with a trailing comment listing deduplicated recommendations.
    """
    if not issues:
        return None
    recommendations = [issue.recommendation for issue in issues if issue.recommendation]
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    directions = " | ".join(dict.fromkeys(recommendations))
    return f"{code.rstrip()}\n\n# Suggested review directions: {directions}"
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def _deduplicate(findings: List[ReviewFinding]) -> List[ReviewFinding]:
    """Drop duplicate findings that refer to the same rule and line.

    First occurrence wins; order is otherwise preserved.
    """
    kept: List[ReviewFinding] = []
    visited = set()
    for finding in findings:
        identity = (finding.rule_id, finding.line, finding.category)
        if identity not in visited:
            visited.add(identity)
            kept.append(finding)
    return kept
|
pytest-cache-files-1f62ra1g/container_sim/server/task_bank.py
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Static PR-review tasks and hidden grading rubrics."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Dict, Iterable, List, Sequence
|
| 7 |
+
|
| 8 |
+
try:
|
| 9 |
+
from models import Category, Difficulty, Severity, TaskDescriptor, TaskSummary
|
| 10 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 11 |
+
from ..models import Category, Difficulty, Severity, TaskDescriptor, TaskSummary
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass(frozen=True)
class RubricIssue:
    """One hidden issue that can be matched by the deterministic grader."""

    # Stable identifier; the grader uses it to avoid matching twice.
    issue_id: str
    # Location the finding must point at (the grader tolerates a small
    # line offset around `line`).
    file_path: str
    line: int
    category: Category
    severity: Severity
    # Tokens searched for in the finding text; at least `min_keyword_hits`
    # of them must appear for the match to count.
    keywords: Sequence[str]
    min_keyword_hits: int
    # Contribution to the task score when this issue is matched.
    weight: float
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass(frozen=True)
class TaskSpec:
    """Complete task definition, including hidden rubric metadata."""

    task_id: str
    difficulty: Difficulty
    title: str
    goal: str
    repo_summary: str
    # Diff shown to the reviewer; file_contents maps path -> full file text.
    visible_diff: str
    file_contents: Dict[str, str]
    changed_files: Sequence[str]
    # Hidden grading rubric; deliberately omitted from to_descriptor() and
    # to_summary() so it never leaks to clients.
    rubric_issues: Sequence[RubricIssue]
    max_steps: int

    @property
    def available_files(self) -> List[str]:
        """All file paths the reviewer may open for this task."""
        return list(self.file_contents.keys())

    def to_descriptor(self) -> TaskDescriptor:
        """Public task view for an active episode (no rubric fields)."""
        return TaskDescriptor(
            task_id=self.task_id,
            difficulty=self.difficulty,
            title=self.title,
            goal=self.goal,
            repo_summary=self.repo_summary,
            changed_files=list(self.changed_files),
            available_files=self.available_files,
            max_steps=self.max_steps,
        )

    def to_summary(self) -> TaskSummary:
        """Compact listing entry used by task-catalog endpoints."""
        return TaskSummary(
            task_id=self.task_id,
            difficulty=self.difficulty,
            title=self.title,
            goal=self.goal,
        )
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
TASKS: List[TaskSpec] = [
|
| 69 |
+
TaskSpec(
|
| 70 |
+
task_id="py-pr-review-easy",
|
| 71 |
+
difficulty="easy",
|
| 72 |
+
title="Retry Delay Regression",
|
| 73 |
+
goal=(
|
| 74 |
+
"Review the pull request and identify the real bug introduced in the retry "
|
| 75 |
+
"delay helper before it ships."
|
| 76 |
+
),
|
| 77 |
+
repo_summary=(
|
| 78 |
+
"This service computes retry delays for background notification delivery. "
|
| 79 |
+
"The change is intended to relax validation for legacy callers."
|
| 80 |
+
),
|
| 81 |
+
visible_diff="\n".join(
|
| 82 |
+
[
|
| 83 |
+
"diff --git a/src/notifications/retry.py b/src/notifications/retry.py",
|
| 84 |
+
"@@",
|
| 85 |
+
"- if base_delay <= 0:",
|
| 86 |
+
"+ if base_delay < 0:",
|
| 87 |
+
" return 0.0",
|
| 88 |
+
]
|
| 89 |
+
),
|
| 90 |
+
file_contents={
|
| 91 |
+
"src/notifications/retry.py": "\n".join(
|
| 92 |
+
[
|
| 93 |
+
"from __future__ import annotations",
|
| 94 |
+
"",
|
| 95 |
+
"def calculate_retry_delay(attempt: int, base_delay: float = 2.0) -> float:",
|
| 96 |
+
' """Return the retry delay in seconds."""',
|
| 97 |
+
" if attempt < 0:",
|
| 98 |
+
' raise ValueError(\"attempt must be >= 0\")',
|
| 99 |
+
" if base_delay < 0:",
|
| 100 |
+
" return 0.0",
|
| 101 |
+
" return attempt / base_delay",
|
| 102 |
+
]
|
| 103 |
+
)
|
| 104 |
+
},
|
| 105 |
+
changed_files=("src/notifications/retry.py",),
|
| 106 |
+
rubric_issues=(
|
| 107 |
+
RubricIssue(
|
| 108 |
+
issue_id="zero-base-delay-divides",
|
| 109 |
+
file_path="src/notifications/retry.py",
|
| 110 |
+
line=7,
|
| 111 |
+
category="bug",
|
| 112 |
+
severity="warning",
|
| 113 |
+
keywords=("zero", "division", "base_delay"),
|
| 114 |
+
min_keyword_hits=2,
|
| 115 |
+
weight=1.0,
|
| 116 |
+
),
|
| 117 |
+
),
|
| 118 |
+
max_steps=4,
|
| 119 |
+
),
|
| 120 |
+
TaskSpec(
|
| 121 |
+
task_id="py-pr-review-medium",
|
| 122 |
+
difficulty="medium",
|
| 123 |
+
title="Coupon Billing Rollout",
|
| 124 |
+
goal=(
|
| 125 |
+
"Review the billing change and identify both the production regression and "
|
| 126 |
+
"the missing coverage that would have caught it."
|
| 127 |
+
),
|
| 128 |
+
repo_summary=(
|
| 129 |
+
"The billing service is adding coupon support for one-off invoices. The PR "
|
| 130 |
+
"touches both the service code and its unit tests."
|
| 131 |
+
),
|
| 132 |
+
visible_diff="\n".join(
|
| 133 |
+
[
|
| 134 |
+
"diff --git a/app/billing/invoice_service.py b/app/billing/invoice_service.py",
|
| 135 |
+
"@@",
|
| 136 |
+
" def charge_invoice(order: dict, gateway: Gateway) -> str:",
|
| 137 |
+
"- return gateway.charge(order[\"customer_id\"], order[\"amount_cents\"])",
|
| 138 |
+
"+ total = order[\"amount_cents\"]",
|
| 139 |
+
"+ coupon = order.get(\"coupon_code\")",
|
| 140 |
+
"+ if coupon:",
|
| 141 |
+
"+ discount = gateway.lookup_discount(coupon)",
|
| 142 |
+
"+ total = max(total - discount, 0)",
|
| 143 |
+
"+ return gateway.charge(order[\"customer_id\"], order[\"amount_cents\"])",
|
| 144 |
+
"",
|
| 145 |
+
"diff --git a/tests/test_invoice_service.py b/tests/test_invoice_service.py",
|
| 146 |
+
"@@",
|
| 147 |
+
" class FakeGateway:",
|
| 148 |
+
"+ def lookup_discount(self, coupon: str) -> int:",
|
| 149 |
+
"+ return 250",
|
| 150 |
+
]
|
| 151 |
+
),
|
| 152 |
+
file_contents={
|
| 153 |
+
"app/billing/invoice_service.py": "\n".join(
|
| 154 |
+
[
|
| 155 |
+
"from gateway import Gateway",
|
| 156 |
+
"",
|
| 157 |
+
"def charge_invoice(order: dict, gateway: Gateway) -> str:",
|
| 158 |
+
' total = order["amount_cents"]',
|
| 159 |
+
' coupon = order.get("coupon_code")',
|
| 160 |
+
" if coupon:",
|
| 161 |
+
" discount = gateway.lookup_discount(coupon)",
|
| 162 |
+
" total = max(total - discount, 0)",
|
| 163 |
+
' return gateway.charge(order["customer_id"], order["amount_cents"])',
|
| 164 |
+
]
|
| 165 |
+
),
|
| 166 |
+
"tests/test_invoice_service.py": "\n".join(
|
| 167 |
+
[
|
| 168 |
+
"from app.billing.invoice_service import charge_invoice",
|
| 169 |
+
"",
|
| 170 |
+
"class FakeGateway:",
|
| 171 |
+
" def lookup_discount(self, coupon: str) -> int:",
|
| 172 |
+
" return 250",
|
| 173 |
+
"",
|
| 174 |
+
" def charge(self, customer_id: str, amount_cents: int) -> str:",
|
| 175 |
+
" self.last_charge = (customer_id, amount_cents)",
|
| 176 |
+
' return "charge_123"',
|
| 177 |
+
"",
|
| 178 |
+
"def test_charge_invoice_without_coupon():",
|
| 179 |
+
" gateway = FakeGateway()",
|
| 180 |
+
' charge_invoice({"customer_id": "cus_1", "amount_cents": 1000}, gateway)',
|
| 181 |
+
' assert gateway.last_charge == ("cus_1", 1000)',
|
| 182 |
+
]
|
| 183 |
+
),
|
| 184 |
+
},
|
| 185 |
+
changed_files=("app/billing/invoice_service.py", "tests/test_invoice_service.py"),
|
| 186 |
+
rubric_issues=(
|
| 187 |
+
RubricIssue(
|
| 188 |
+
issue_id="discount-total-unused",
|
| 189 |
+
file_path="app/billing/invoice_service.py",
|
| 190 |
+
line=8,
|
| 191 |
+
category="bug",
|
| 192 |
+
severity="warning",
|
| 193 |
+
keywords=("discount", "total", "charge", "amount"),
|
| 194 |
+
min_keyword_hits=2,
|
| 195 |
+
weight=0.6,
|
| 196 |
+
),
|
| 197 |
+
RubricIssue(
|
| 198 |
+
issue_id="missing-coupon-test",
|
| 199 |
+
file_path="tests/test_invoice_service.py",
|
| 200 |
+
line=11,
|
| 201 |
+
category="testing",
|
| 202 |
+
severity="warning",
|
| 203 |
+
keywords=("missing", "test", "coupon", "discount"),
|
| 204 |
+
min_keyword_hits=2,
|
| 205 |
+
weight=0.4,
|
| 206 |
+
),
|
| 207 |
+
),
|
| 208 |
+
max_steps=5,
|
| 209 |
+
),
|
| 210 |
+
TaskSpec(
|
| 211 |
+
task_id="py-pr-review-hard",
|
| 212 |
+
difficulty="hard",
|
| 213 |
+
title="Async Job Runner Deduplication",
|
| 214 |
+
goal=(
|
| 215 |
+
"Review the async job-runner PR and find the subtle concurrency issues "
|
| 216 |
+
"without inventing extra problems."
|
| 217 |
+
),
|
| 218 |
+
repo_summary=(
|
| 219 |
+
"A shared webhook backfill service is deduplicating in-flight work with an "
|
| 220 |
+
"async task cache and writing the latest result for operators to inspect."
|
| 221 |
+
),
|
| 222 |
+
visible_diff="\n".join(
|
| 223 |
+
[
|
| 224 |
+
"diff --git a/app/jobs/runner.py b/app/jobs/runner.py",
|
| 225 |
+
"@@",
|
| 226 |
+
" async def run_job(job_id: str, payload: dict, worker) -> str:",
|
| 227 |
+
" if job_id in ACTIVE_RUNS:",
|
| 228 |
+
" return await ACTIVE_RUNS[job_id]",
|
| 229 |
+
"+ lock = asyncio.Lock()",
|
| 230 |
+
"+ async with lock:",
|
| 231 |
+
"+ task = asyncio.create_task(worker.run(payload))",
|
| 232 |
+
"+ ACTIVE_RUNS[job_id] = task",
|
| 233 |
+
" try:",
|
| 234 |
+
" result = await task",
|
| 235 |
+
" finally:",
|
| 236 |
+
" ACTIVE_RUNS.pop(job_id, None)",
|
| 237 |
+
"+ Path(\"latest-result.json\").write_text(result)",
|
| 238 |
+
" return result",
|
| 239 |
+
]
|
| 240 |
+
),
|
| 241 |
+
file_contents={
|
| 242 |
+
"app/jobs/runner.py": "\n".join(
|
| 243 |
+
[
|
| 244 |
+
"import asyncio",
|
| 245 |
+
"from pathlib import Path",
|
| 246 |
+
"",
|
| 247 |
+
"ACTIVE_RUNS: dict[str, asyncio.Task[str]] = {}",
|
| 248 |
+
"",
|
| 249 |
+
"async def run_job(job_id: str, payload: dict, worker) -> str:",
|
| 250 |
+
" if job_id in ACTIVE_RUNS:",
|
| 251 |
+
" return await ACTIVE_RUNS[job_id]",
|
| 252 |
+
"",
|
| 253 |
+
" lock = asyncio.Lock()",
|
| 254 |
+
" async with lock:",
|
| 255 |
+
" task = asyncio.create_task(worker.run(payload))",
|
| 256 |
+
" ACTIVE_RUNS[job_id] = task",
|
| 257 |
+
" try:",
|
| 258 |
+
" result = await task",
|
| 259 |
+
" finally:",
|
| 260 |
+
" ACTIVE_RUNS.pop(job_id, None)",
|
| 261 |
+
"",
|
| 262 |
+
' Path("latest-result.json").write_text(result)',
|
| 263 |
+
" return result",
|
| 264 |
+
]
|
| 265 |
+
),
|
| 266 |
+
"tests/test_runner.py": "\n".join(
|
| 267 |
+
[
|
| 268 |
+
"import pytest",
|
| 269 |
+
"",
|
| 270 |
+
"from app.jobs.runner import run_job",
|
| 271 |
+
"",
|
| 272 |
+
"class FakeWorker:",
|
| 273 |
+
" async def run(self, payload: dict) -> str:",
|
| 274 |
+
' return payload["job_id"]',
|
| 275 |
+
"",
|
| 276 |
+
"@pytest.mark.asyncio",
|
| 277 |
+
"async def test_run_job_returns_worker_result():",
|
| 278 |
+
" worker = FakeWorker()",
|
| 279 |
+
' result = await run_job("job-1", {"job_id": "job-1"}, worker)',
|
| 280 |
+
' assert result == "job-1"',
|
| 281 |
+
]
|
| 282 |
+
),
|
| 283 |
+
},
|
| 284 |
+
changed_files=("app/jobs/runner.py", "tests/test_runner.py"),
|
| 285 |
+
rubric_issues=(
|
| 286 |
+
RubricIssue(
|
| 287 |
+
issue_id="per-call-lock-race",
|
| 288 |
+
file_path="app/jobs/runner.py",
|
| 289 |
+
line=9,
|
| 290 |
+
category="bug",
|
| 291 |
+
severity="warning",
|
| 292 |
+
keywords=("lock", "race", "concurrent", "duplicate"),
|
| 293 |
+
min_keyword_hits=2,
|
| 294 |
+
weight=0.55,
|
| 295 |
+
),
|
| 296 |
+
RubricIssue(
|
| 297 |
+
issue_id="shared-output-file-race",
|
| 298 |
+
file_path="app/jobs/runner.py",
|
| 299 |
+
line=18,
|
| 300 |
+
category="maintainability",
|
| 301 |
+
severity="warning",
|
| 302 |
+
keywords=("latest", "result", "file", "concurrent", "overwrite"),
|
| 303 |
+
min_keyword_hits=2,
|
| 304 |
+
weight=0.45,
|
| 305 |
+
),
|
| 306 |
+
),
|
| 307 |
+
max_steps=6,
|
| 308 |
+
),
|
| 309 |
+
]
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
TASKS_BY_ID: Dict[str, TaskSpec] = {task.task_id: task for task in TASKS}
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
def list_task_descriptors() -> List[TaskDescriptor]:
    """Return public descriptors for all tasks, in benchmark order."""
    return [spec.to_descriptor() for spec in TASKS]
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
def list_task_summaries() -> List[TaskSummary]:
    """Return task summaries for lightweight route responses."""
    return [spec.to_summary() for spec in TASKS]
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def get_task(task_id: str) -> TaskSpec:
    """Look up a task by id, raising ValueError for unknown ids."""
    try:
        return TASKS_BY_ID[task_id]
    except KeyError as missing:  # pragma: no cover
        # Re-raise as ValueError but keep the KeyError as the chained cause.
        raise ValueError(f"Unknown task_id: {task_id}") from missing
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
def task_ids() -> Iterable[str]:
    """Return task ids in benchmark order."""
    return [spec.task_id for spec in TASKS]
|
| 340 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/tasks/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Self-contained task definitions for container builds."""
|
| 2 |
+
|
| 3 |
+
from .task_bank import TaskSpec, get_task, list_task_descriptors, list_task_summaries, task_ids
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
"TaskSpec",
|
| 7 |
+
"get_task",
|
| 8 |
+
"list_task_descriptors",
|
| 9 |
+
"list_task_summaries",
|
| 10 |
+
"task_ids",
|
| 11 |
+
]
|
| 12 |
+
|
pytest-cache-files-1f62ra1g/container_sim/server/tasks/task_bank.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic task bank for self-contained server builds."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Dict, List, Optional
|
| 7 |
+
|
| 8 |
+
from ..models import Difficulty, TaskDescriptor, TaskKind
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass(frozen=True)
class TaskSpec:
    """A single coding task: prompt, starter/reference code, and grading knobs."""

    task_id: str
    title: str
    difficulty: Difficulty
    task_kind: TaskKind
    task_description: str
    # Code the agent starts from.
    starter_code: str
    # Known-good solution; never exposed via to_descriptor().
    reference_code: str
    # Assertion expressions shown to the agent.
    visible_tests: List[str]
    # Assertion expressions held back for grading; never exposed via to_descriptor().
    hidden_tests: List[str]
    max_steps: int = 10
    # Optional benchmark configuration (used by optimization-style tasks).
    benchmark_entrypoint: Optional[str] = None
    benchmark_builder: Optional[str] = None
    benchmark_repeats: int = 1
    benchmark_timeout_s: float = 2.0
    # Style-grading knobs.
    style_max_line_length: int = 88
    expected_quality_markers: List[str] = field(default_factory=list)

    def to_descriptor(self) -> TaskDescriptor:
        """Build the agent-facing view: omits reference code and hidden tests."""
        return TaskDescriptor(
            task_id=self.task_id,
            title=self.title,
            difficulty=self.difficulty,
            task_kind=self.task_kind,
            task_description=self.task_description,
            starter_code=self.starter_code,
            visible_tests=list(self.visible_tests),
            max_steps=self.max_steps,
        )
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
TASK_SYNTAX_FIX = TaskSpec(
|
| 44 |
+
task_id="syntax-fix-easy",
|
| 45 |
+
title="Fix a syntax-broken username normalizer",
|
| 46 |
+
difficulty="easy",
|
| 47 |
+
task_kind="syntax_fix",
|
| 48 |
+
task_description=(
|
| 49 |
+
"You are reviewing a utility function before merge. The submitted patch left "
|
| 50 |
+
"the function with syntax errors. Repair the code so it compiles and preserves "
|
| 51 |
+
"the intended behavior of trimming, lowercasing, and replacing spaces with underscores."
|
| 52 |
+
),
|
| 53 |
+
starter_code='''def normalize_username(raw_name: str) -> str:
|
| 54 |
+
cleaned = raw_name.strip().lower(
|
| 55 |
+
if not cleaned:
|
| 56 |
+
return "anonymous"
|
| 57 |
+
return cleaned.replace(" ", "_")
|
| 58 |
+
''',
|
| 59 |
+
reference_code='''def normalize_username(raw_name: str) -> str:
|
| 60 |
+
cleaned = raw_name.strip().lower()
|
| 61 |
+
if not cleaned:
|
| 62 |
+
return "anonymous"
|
| 63 |
+
return cleaned.replace(" ", "_")
|
| 64 |
+
''',
|
| 65 |
+
visible_tests=[
|
| 66 |
+
"normalize_username(' Alice Smith ') == 'alice_smith'",
|
| 67 |
+
"normalize_username(' ') == 'anonymous'",
|
| 68 |
+
"normalize_username('Bob') == 'bob'",
|
| 69 |
+
],
|
| 70 |
+
hidden_tests=[
|
| 71 |
+
"normalize_username(' HELLO WORLD ') == 'hello_world'",
|
| 72 |
+
"normalize_username('') == 'anonymous'",
|
| 73 |
+
],
|
| 74 |
+
max_steps=8,
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
TASK_BUG_FIX = TaskSpec(
|
| 79 |
+
task_id="bug-fix-medium",
|
| 80 |
+
title="Repair invoice discount calculation logic",
|
| 81 |
+
difficulty="medium",
|
| 82 |
+
task_kind="bug_fix",
|
| 83 |
+
task_description=(
|
| 84 |
+
"A billing helper function is returning the wrong amount after applying discounts. "
|
| 85 |
+
"The function signature is correct, but the calculation logic is broken. "
|
| 86 |
+
"Inspect the implementation, run visible tests, and fix the bug so all tests pass. "
|
| 87 |
+
"Do not change the function signature or validation logic."
|
| 88 |
+
),
|
| 89 |
+
starter_code='''from typing import Iterable
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def calculate_invoice_total(line_items: Iterable[int], discount_percent: int) -> int:
|
| 93 |
+
"""Calculate invoice total with discount applied."""
|
| 94 |
+
if discount_percent < 0 or discount_percent > 100:
|
| 95 |
+
raise ValueError("discount_percent must be between 0 and 100")
|
| 96 |
+
|
| 97 |
+
subtotal = sum(line_items)
|
| 98 |
+
discounted_total = subtotal - (subtotal * discount_percent // 100)
|
| 99 |
+
return subtotal
|
| 100 |
+
''',
|
| 101 |
+
reference_code='''from typing import Iterable
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def calculate_invoice_total(line_items: Iterable[int], discount_percent: int) -> int:
|
| 105 |
+
"""Calculate invoice total with discount applied."""
|
| 106 |
+
if discount_percent < 0 or discount_percent > 100:
|
| 107 |
+
raise ValueError("discount_percent must be between 0 and 100")
|
| 108 |
+
|
| 109 |
+
subtotal = sum(line_items)
|
| 110 |
+
discounted_total = subtotal - (subtotal * discount_percent // 100)
|
| 111 |
+
return discounted_total
|
| 112 |
+
''',
|
| 113 |
+
visible_tests=[
|
| 114 |
+
"calculate_invoice_total([1000, 2000], 0) == 3000",
|
| 115 |
+
"calculate_invoice_total([1000, 2000], 50) == 1500",
|
| 116 |
+
"calculate_invoice_total([1000], 10) == 900",
|
| 117 |
+
"calculate_invoice_total([], 0) == 0",
|
| 118 |
+
],
|
| 119 |
+
hidden_tests=[
|
| 120 |
+
"calculate_invoice_total([100, 200, 300], 25) == 450",
|
| 121 |
+
"calculate_invoice_total([5000], 99) == 50",
|
| 122 |
+
],
|
| 123 |
+
max_steps=10,
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
TASK_OPTIMIZATION = TaskSpec(
|
| 128 |
+
task_id="optimization-hard",
|
| 129 |
+
title="Optimize inefficient user activity summarization",
|
| 130 |
+
difficulty="hard",
|
| 131 |
+
task_kind="optimization",
|
| 132 |
+
task_description=(
|
| 133 |
+
"Code review found that `summarize_user_activity` is inefficient for large event streams. "
|
| 134 |
+
"The current implementation repeatedly scans the full event list for every user, making it O(n**2). "
|
| 135 |
+
"Refactor it to aggregate counts in one pass while preserving the sorted output contract. "
|
| 136 |
+
"Style and code quality also matter: use idiomatic Python, proper types, and clear logic. "
|
| 137 |
+
"All tests must pass, and the optimized version should be measurably faster."
|
| 138 |
+
),
|
| 139 |
+
starter_code='''from typing import Iterable
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def summarize_user_activity(events: Iterable[dict]) -> list[tuple[str, int]]:
|
| 143 |
+
"""Aggregate user activity counts."""
|
| 144 |
+
|
| 145 |
+
ordered_users = []
|
| 146 |
+
for event in events:
|
| 147 |
+
user_id = event["user_id"]
|
| 148 |
+
if user_id not in ordered_users:
|
| 149 |
+
ordered_users.append(user_id)
|
| 150 |
+
|
| 151 |
+
summary = []
|
| 152 |
+
for user_id in ordered_users:
|
| 153 |
+
count = 0
|
| 154 |
+
for event in events:
|
| 155 |
+
if event["user_id"] == user_id:
|
| 156 |
+
count += 1
|
| 157 |
+
summary.append((user_id, count))
|
| 158 |
+
return sorted(summary, key=lambda item: (-item[1], item[0]))
|
| 159 |
+
''',
|
| 160 |
+
reference_code='''from collections import Counter
|
| 161 |
+
from typing import Iterable
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def summarize_user_activity(events: Iterable[dict]) -> list[tuple[str, int]]:
|
| 165 |
+
"""Aggregate user activity counts in one pass."""
|
| 166 |
+
|
| 167 |
+
counts = Counter(event["user_id"] for event in events)
|
| 168 |
+
return sorted(counts.items(), key=lambda item: (-item[1], item[0]))
|
| 169 |
+
''',
|
| 170 |
+
visible_tests=[
|
| 171 |
+
"summarize_user_activity([{'user_id': 'alice'}, {'user_id': 'bob'}, {'user_id': 'alice'}]) == [('alice', 2), ('bob', 1)]",
|
| 172 |
+
"summarize_user_activity([{'user_id': 'z'}, {'user_id': 'a'}]) == [('a', 1), ('z', 1)]",
|
| 173 |
+
"summarize_user_activity([]) == []",
|
| 174 |
+
"summarize_user_activity([{'user_id': 'solo'}]) == [('solo', 1)]",
|
| 175 |
+
],
|
| 176 |
+
hidden_tests=[
|
| 177 |
+
"summarize_user_activity([{'user_id': 'u2'}, {'user_id': 'u1'}, {'user_id': 'u2'}, {'user_id': 'u2'}, {'user_id': 'u1'}]) == [('u2', 3), ('u1', 2)]",
|
| 178 |
+
],
|
| 179 |
+
max_steps=10,
|
| 180 |
+
benchmark_entrypoint="summarize_user_activity",
|
| 181 |
+
benchmark_builder='''def build_benchmark_events():
|
| 182 |
+
return [{"user_id": f"user_{index % 400}"} for index in range(6000)]''',
|
| 183 |
+
benchmark_repeats=3,
|
| 184 |
+
benchmark_timeout_s=1.0,
|
| 185 |
+
style_max_line_length=88,
|
| 186 |
+
expected_quality_markers=["Counter", "sorted"],
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
TASKS: Dict[str, TaskSpec] = {
|
| 191 |
+
"syntax-fix-easy": TASK_SYNTAX_FIX,
|
| 192 |
+
"bug-fix-medium": TASK_BUG_FIX,
|
| 193 |
+
"optimization-hard": TASK_OPTIMIZATION,
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def task_ids() -> List[str]:
    """Return task ids in the canonical benchmark order.

    Derived from the TASKS registry rather than a duplicated literal list so
    the two can never drift apart (dict insertion order is the registry order).
    """
    return list(TASKS)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def get_task(task_id: str) -> TaskSpec:
    """Look up a task spec, failing loudly for unknown ids."""
    spec = TASKS.get(task_id)
    if spec is None:
        raise ValueError(f"Task {task_id} not found. Available: {list(TASKS.keys())}")
    return spec
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def list_task_descriptors() -> List[TaskDescriptor]:
    """Build the public descriptor for every task, in benchmark order."""
    return [get_task(task_id).to_descriptor() for task_id in task_ids()]
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def list_task_summaries() -> List[TaskDescriptor]:
    """Alias for list_task_descriptors(); this bank has no separate summary model."""
    return list_task_descriptors()
|
| 213 |
+
|
server/app.py
CHANGED
|
@@ -7,17 +7,27 @@ import os
|
|
| 7 |
from fastapi import APIRouter, HTTPException
|
| 8 |
from fastapi.responses import RedirectResponse
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
from models import (
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
)
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
try:
|
|
|
|
| 7 |
from fastapi import APIRouter, HTTPException
|
| 8 |
from fastapi.responses import RedirectResponse
|
| 9 |
|
| 10 |
+
try:
|
| 11 |
+
from compat import create_app
|
| 12 |
+
from models import (
|
| 13 |
+
HealthResponse,
|
| 14 |
+
PythonCodeReviewAction,
|
| 15 |
+
PythonCodeReviewObservation,
|
| 16 |
+
PythonCodeReviewState,
|
| 17 |
+
TaskDescriptor,
|
| 18 |
+
TaskGrade,
|
| 19 |
+
)
|
| 20 |
+
except Exception:
|
| 21 |
+
from .compat import create_app
|
| 22 |
+
from .models import (
|
| 23 |
+
HealthResponse,
|
| 24 |
+
PythonCodeReviewAction,
|
| 25 |
+
PythonCodeReviewObservation,
|
| 26 |
+
PythonCodeReviewState,
|
| 27 |
+
TaskDescriptor,
|
| 28 |
+
TaskGrade,
|
| 29 |
+
)
|
| 30 |
+
from server.env import PythonCodeReviewEnvironment
|
| 31 |
|
| 32 |
|
| 33 |
try:
|
server/compat.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility helpers for OpenEnv and FastMCP runtime drift."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import sys
|
| 6 |
+
import types
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def install_openenv_fastmcp_compat() -> None:
    """Patch FastMCP API differences so older OpenEnv builds keep importing."""
    try:
        import fastmcp  # type: ignore
    except Exception:
        # FastMCP is not installed at all; there is nothing to patch.
        return

    def _ensure_module(name: str) -> types.ModuleType:
        # Return the module registered under ``name``, creating an empty
        # stand-in in sys.modules when it is missing.
        module = sys.modules.get(name)
        if module is None:
            module = types.ModuleType(name)
            sys.modules[name] = module
        return module

    # Best-effort: supply a minimal ``fastmcp.Client`` when the installed
    # FastMCP no longer exports one.
    try:
        if not hasattr(fastmcp, "Client"):

            class CompatClient:
                """Minimal async MCP client used for legacy OpenEnv imports."""

                def __init__(self, *args: Any, **kwargs: Any) -> None:
                    self.args = args
                    self.kwargs = kwargs

                async def __aenter__(self) -> "CompatClient":
                    return self

                async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool:
                    # Never suppress exceptions from the ``async with`` body.
                    return False

                async def list_tools(self) -> list[Any]:
                    return []

                async def call_tool(self, tool_name: str, arguments: dict[str, Any]) -> Any:
                    raise RuntimeError(
                        f"MCP client compatibility mode cannot call tool: {tool_name}"
                    )

            fastmcp.Client = CompatClient  # type: ignore[attr-defined]
    except Exception:
        # Compatibility patching is opportunistic; never break importers.
        pass

    # Best-effort: make ``fastmcp.client.client.CallToolResult`` importable.
    try:
        client_pkg = _ensure_module("fastmcp.client")
        client_mod = _ensure_module("fastmcp.client.client")

        if not hasattr(client_mod, "CallToolResult"):

            class CallToolResult:
                """Compatibility container for legacy OpenEnv response handling."""

                def __init__(
                    self,
                    content: Any = None,
                    structured_content: Any = None,
                    meta: Any = None,
                    data: Any = None,
                    is_error: bool = False,
                ) -> None:
                    self.content = content
                    self.structured_content = structured_content
                    self.meta = meta
                    self.data = data
                    self.is_error = is_error

            client_mod.CallToolResult = CallToolResult

        client_pkg.client = client_mod  # type: ignore[attr-defined]
    except Exception:
        pass
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
install_openenv_fastmcp_compat()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
from openenv.core.env_server.http_server import create_app as openenv_create_app
|
| 84 |
+
from openenv.core.env_server.interfaces import Environment
|
| 85 |
+
from openenv.core.env_server.types import Action, Observation, State
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
create_app = openenv_create_app
|
| 89 |
+
|
server/env_safe.py
CHANGED
|
@@ -5,18 +5,32 @@ from __future__ import annotations
|
|
| 5 |
from typing import Any, Optional
|
| 6 |
from uuid import uuid4
|
| 7 |
|
| 8 |
-
|
| 9 |
-
from
|
| 10 |
-
from
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
INVALID_ACTION_PENALTY = 0.10
|
|
@@ -489,4 +503,3 @@ class PythonCodeReviewEnvironment(
|
|
| 489 |
|
| 490 |
PythonEnvironment = PythonCodeReviewEnvironment
|
| 491 |
CodeReviewEnvironment = PythonCodeReviewEnvironment
|
| 492 |
-
|
|
|
|
| 5 |
from typing import Any, Optional
|
| 6 |
from uuid import uuid4
|
| 7 |
|
| 8 |
+
try:
|
| 9 |
+
from compat import Environment
|
| 10 |
+
from graders import grade_task
|
| 11 |
+
from models import (
|
| 12 |
+
HealthResponse,
|
| 13 |
+
HistoryEntry,
|
| 14 |
+
PythonCodeReviewAction,
|
| 15 |
+
PythonCodeReviewObservation,
|
| 16 |
+
PythonCodeReviewState,
|
| 17 |
+
RewardDetails,
|
| 18 |
+
TaskGrade,
|
| 19 |
+
)
|
| 20 |
+
from tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
|
| 21 |
+
except Exception:
|
| 22 |
+
from .compat import Environment
|
| 23 |
+
from .graders import grade_task
|
| 24 |
+
from .models import (
|
| 25 |
+
HealthResponse,
|
| 26 |
+
HistoryEntry,
|
| 27 |
+
PythonCodeReviewAction,
|
| 28 |
+
PythonCodeReviewObservation,
|
| 29 |
+
PythonCodeReviewState,
|
| 30 |
+
RewardDetails,
|
| 31 |
+
TaskGrade,
|
| 32 |
+
)
|
| 33 |
+
from .tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
|
| 34 |
|
| 35 |
|
| 36 |
INVALID_ACTION_PENALTY = 0.10
|
|
|
|
| 503 |
|
| 504 |
PythonEnvironment = PythonCodeReviewEnvironment
|
| 505 |
CodeReviewEnvironment = PythonCodeReviewEnvironment
|
|
|
server/graders/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic graders for self-contained server builds."""
|
| 2 |
+
|
| 3 |
+
from .common import clamp_score
|
| 4 |
+
from .optimization import grade_optimization_task
|
| 5 |
+
from .pytest_runner import PytestExecution, run_pytest_suite
|
| 6 |
+
from .syntax import grade_bug_fix_task, grade_syntax_task, grade_task
|
| 7 |
+
|
| 8 |
+
__all__ = [
|
| 9 |
+
"PytestExecution",
|
| 10 |
+
"clamp_score",
|
| 11 |
+
"grade_bug_fix_task",
|
| 12 |
+
"grade_optimization_task",
|
| 13 |
+
"grade_syntax_task",
|
| 14 |
+
"grade_task",
|
| 15 |
+
"run_pytest_suite",
|
| 16 |
+
]
|
| 17 |
+
|
server/graders/common.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared deterministic scoring helpers."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import ast
|
| 6 |
+
import difflib
|
| 7 |
+
import traceback
|
| 8 |
+
from typing import Tuple
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def clamp_score(value: float) -> float:
    """Snap a raw score into [0.0, 1.0] after rounding to six decimal places."""
    rounded = round(value, 6)
    capped = min(1.0, rounded)
    return max(0.0, capped)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def syntax_error_message(code: str) -> str:
    """Describe why *code* fails to parse, or return "" when it parses cleanly.

    SyntaxError produces a compact location-bearing message; any other parser
    failure falls back to a one-frame traceback summary.
    """
    try:
        ast.parse(code)
    except SyntaxError as exc:
        return f"{exc.msg} (line {exc.lineno}, column {exc.offset})"
    except Exception:
        return traceback.format_exc(limit=1).strip()
    else:
        return ""
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def compiles(code: str) -> bool:
    """Report whether *code* byte-compiles as a standalone module."""
    try:
        compile(code, "<candidate>", "exec")
        return True
    except Exception:
        # Any compile-time failure (syntax, null bytes, ...) counts as False.
        return False
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def normalized_diff_score(code: str, reference_code: str) -> float:
    """Similarity ratio between two code strings, ignoring all whitespace."""
    stripped_candidate = "".join(code.split())
    stripped_reference = "".join(reference_code.split())
    matcher = difflib.SequenceMatcher(a=stripped_candidate, b=stripped_reference)
    return clamp_score(matcher.ratio())
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def style_score(code: str, max_line_length: int = 88) -> float:
    """Score basic formatting hygiene, weighted 60/20/20 and clamped to [0, 1].

    Components: fraction of lines within ``max_line_length``; whether the code
    is tab-free; whether it is free of trailing whitespace.
    """
    lines = code.splitlines() or [""]
    within_limit = sum(1 for line in lines if len(line) <= max_line_length)
    length_component = within_limit / len(lines)
    tab_component = 0.0 if any("\t" in line for line in lines) else 1.0
    trailing_component = 0.0 if any(line != line.rstrip() for line in lines) else 1.0
    return clamp_score(
        (length_component * 0.6) + (tab_component * 0.2) + (trailing_component * 0.2)
    )
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def nested_loop_depth(tree: ast.AST) -> int:
    """Return the deepest nesting level of for/async-for/while loops in *tree*."""
    deepest = 0
    # Explicit DFS stack of (node, loop depth at that node) pairs.
    pending = [(tree, 0)]
    while pending:
        node, depth = pending.pop()
        if isinstance(node, (ast.For, ast.AsyncFor, ast.While)):
            depth += 1
            deepest = max(deepest, depth)
        for child in ast.iter_child_nodes(node):
            pending.append((child, depth))
    return deepest
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def compile_tree(code: str) -> Tuple[ast.AST | None, str]:
    """Parse *code*: returns (tree, '') on success, (None, error message) on SyntaxError."""
    try:
        tree = ast.parse(code)
    except SyntaxError as error:
        return None, f"{error.msg} (line {error.lineno}, column {error.offset})"
    return tree, ""
|
| 69 |
+
|
server/graders/optimization.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic grading for optimization tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
import tempfile
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
from .common import clamp_score, compile_tree, nested_loop_depth, style_score
|
| 12 |
+
from .pytest_runner import run_pytest_suite
|
| 13 |
+
from ..models import TaskGrade
|
| 14 |
+
from ..tasks.task_bank import TaskSpec
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _benchmark_script(task: TaskSpec) -> str:
    """Render a standalone benchmark script for *task*.

    The generated script imports the entrypoint from the ``candidate`` module,
    builds input data via the task-provided ``build_benchmark_events`` builder,
    times ``benchmark_repeats`` calls, and writes ``{"elapsed", "rows"}`` to
    ``benchmark.json`` in the working directory.
    """
    # ``Path`` is bound via __import__ inside the generated script so the
    # template needs no additional import lines at its top.
    return f"""import json
import time
from candidate import {task.benchmark_entrypoint}

{task.benchmark_builder}

events = build_benchmark_events()
start = time.perf_counter()
for _ in range({task.benchmark_repeats}):
    result = {task.benchmark_entrypoint}(events)
elapsed = time.perf_counter() - start
Path = __import__("pathlib").Path
Path("benchmark.json").write_text(json.dumps({{"elapsed": elapsed, "rows": len(result)}}), encoding="utf-8")
"""
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def benchmark_runtime(candidate_code: str, task: TaskSpec) -> tuple[float, bool, str]:
    """Benchmark *candidate_code* against the task's starter implementation.

    Writes both implementations plus generated runner scripts into a temp
    directory, executes each runner in a subprocess, and compares elapsed
    times. Returns ``(runtime_score, timed_out, output)`` where
    ``runtime_score`` in [0, 1] rewards speedups above 1x (saturating at 4x),
    ``timed_out`` flags a benchmark timeout, and ``output`` carries captured
    stdout/stderr plus a timing summary. Never raises.
    """
    assert task.benchmark_entrypoint is not None
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-bench-") as temp_dir:
            temp_path = Path(temp_dir)
            results_path = temp_path / "benchmark.json"
            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (temp_path / "starter.py").write_text(task.starter_code, encoding="utf-8")
            (temp_path / "candidate_runner.py").write_text(_benchmark_script(task), encoding="utf-8")
            starter_script = _benchmark_script(task).replace("from candidate import", "from starter import")
            (temp_path / "starter_runner.py").write_text(starter_script, encoding="utf-8")

            def _run(script_name: str):
                # Both runners write their results to benchmark.json in cwd.
                return subprocess.run(
                    [sys.executable, script_name],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=task.benchmark_timeout_s,
                    check=False,
                )

            try:
                starter_run = _run("starter_runner.py")
                starter_payload = json.loads(results_path.read_text(encoding="utf-8"))
                # BUG FIX: remove the starter's results before the candidate runs.
                # Previously a candidate that crashed without writing results was
                # silently graded against the stale starter payload (speedup 1.0).
                results_path.unlink()
                candidate_run = _run("candidate_runner.py")
                if not results_path.exists():
                    failure_output = ((candidate_run.stdout or "") + (candidate_run.stderr or "")).strip()
                    return 0.0, False, failure_output or "candidate benchmark produced no results"
                candidate_payload = json.loads(results_path.read_text(encoding="utf-8"))
            except subprocess.TimeoutExpired as exc:
                output = (exc.stdout or "") + (exc.stderr or "")
                return 0.0, True, (output or "benchmark timed out").strip()
            except Exception as exc:
                return 0.0, False, str(exc)

            # Guard against zero/negative clock readings before dividing.
            starter_elapsed = max(float(starter_payload["elapsed"]), 1e-9)
            candidate_elapsed = max(float(candidate_payload["elapsed"]), 1e-9)
            speedup = starter_elapsed / candidate_elapsed
            # Linear credit: 1x speedup -> 0.0, 4x or better -> 1.0.
            runtime_score = clamp_score(min((speedup - 1.0) / 3.0, 1.0))
            output = "\n".join(
                part
                for part in [
                    starter_run.stdout.strip(),
                    starter_run.stderr.strip(),
                    candidate_run.stdout.strip(),
                    candidate_run.stderr.strip(),
                    f"starter={starter_elapsed:.6f}s candidate={candidate_elapsed:.6f}s speedup={speedup:.2f}x",
                ]
                if part
            )
            return runtime_score, False, output
    except Exception as exc:
        # Last-resort guard: benchmarking must never crash the grading path.
        return 0.0, False, str(exc)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def ast_quality_score(code: str, task: TaskSpec) -> float:
    """Heuristic quality score in [0, 1]: +0.2 docstring, +0.4 shallow loops, +0.2 per quality marker."""
    import ast

    tree, _ = compile_tree(code)
    if tree is None:
        return 0.0

    points = 0.0
    # First top-level function (sync or async), if any, is checked for a docstring.
    for node in tree.body:
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            if ast.get_docstring(node, clean=False):
                points += 0.2
            break
    if nested_loop_depth(tree) <= 1:
        points += 0.4
    matched_markers = sum(1 for marker in task.expected_quality_markers if marker in code)
    points += 0.2 * matched_markers
    return clamp_score(points)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def grade_optimization_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
|
| 107 |
+
execution = run_pytest_suite(candidate_code, [*task.visible_tests, *task.hidden_tests], timeout_s=task.benchmark_timeout_s)
|
| 108 |
+
test_fraction = execution.passed / execution.total if execution.total else 0.0
|
| 109 |
+
|
| 110 |
+
if execution.timed_out:
|
| 111 |
+
return TaskGrade(score=0.0, tests_passed=execution.passed, tests_total=execution.total, timed_out=True, details={"tests": execution.output})
|
| 112 |
+
|
| 113 |
+
runtime_score, timed_out, benchmark_output = benchmark_runtime(candidate_code, task)
|
| 114 |
+
if timed_out:
|
| 115 |
+
return TaskGrade(score=0.0, tests_passed=execution.passed, tests_total=execution.total, timed_out=True, details={"tests": execution.output, "benchmark": benchmark_output})
|
| 116 |
+
|
| 117 |
+
quality_score = ast_quality_score(candidate_code, task)
|
| 118 |
+
pep8_score = style_score(candidate_code, task.style_max_line_length)
|
| 119 |
+
score = clamp_score((0.5 * test_fraction) + (0.3 * runtime_score) + (0.15 * quality_score) + (0.05 * pep8_score))
|
| 120 |
+
return TaskGrade(
|
| 121 |
+
score=score,
|
| 122 |
+
syntax_score=1.0,
|
| 123 |
+
tests_passed=execution.passed,
|
| 124 |
+
tests_total=execution.total,
|
| 125 |
+
quality_score=quality_score,
|
| 126 |
+
runtime_score=runtime_score,
|
| 127 |
+
details={
|
| 128 |
+
"tests": execution.output,
|
| 129 |
+
"benchmark": benchmark_output,
|
| 130 |
+
"test_fraction": round(test_fraction, 4),
|
| 131 |
+
"runtime_score": round(runtime_score, 4),
|
| 132 |
+
"style_score": round(pep8_score, 4),
|
| 133 |
+
},
|
| 134 |
+
)
|
| 135 |
+
|
server/graders/pytest_runner.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Helpers for deterministic pytest execution in temp sandboxes."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
import tempfile
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Iterable
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass(frozen=True)
class PytestExecution:
    """Outcome of one sandboxed pytest run."""

    passed: int  # tests that passed in the "call" phase
    failed: int  # tests that failed in the "call" phase
    total: int  # total tests accounted for (at least the requested count)
    timed_out: bool  # True when the subprocess hit its timeout
    output: str  # combined stdout/stderr, or an error message
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _test_module_source(tests: Iterable[str]) -> str:
    """Build the test-module text for the sandbox.

    Snippets that already define a ``test_`` function are passed through
    verbatim; bare expressions are wrapped into numbered assert tests.
    Blank entries are skipped (their enumeration index is still consumed).
    """
    parts: list[str] = ["from candidate import *  # noqa: F401,F403"]
    for position, raw in enumerate(tests, start=1):
        text = str(raw).strip()
        if not text:
            continue
        if text.startswith("def test_"):
            parts.append(text)
        else:
            parts.append(f"def test_case_{position:03d}():\n    assert {text}")
    # The fallback is unreachable in practice (the import line is always present)
    # but preserved for safety.
    return "\n\n".join(parts) or "def test_placeholder():\n    assert True\n"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _runner_script() -> str:
    """Return the source of the in-sandbox pytest driver.

    The driver runs ``test_candidate.py`` with a plugin that counts pass/fail
    outcomes from the "call" phase only (setup/teardown reports are ignored)
    and writes the tallies plus pytest's exit code to ``pytest_results.json``.
    """
    return """import json
import pathlib
import pytest


class Collector:
    def __init__(self) -> None:
        self.passed = 0
        self.failed = 0

    def pytest_runtest_logreport(self, report):
        if report.when != "call":
            return
        if report.passed:
            self.passed += 1
        elif report.failed:
            self.failed += 1


collector = Collector()
exit_code = pytest.main(["-q", "test_candidate.py"], plugins=[collector])
payload = {
    "passed": collector.passed,
    "failed": collector.failed,
    "exit_code": int(exit_code),
}
pathlib.Path("pytest_results.json").write_text(json.dumps(payload), encoding="utf-8")
"""
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def run_pytest_suite(candidate_code: str, tests: Iterable[str], timeout_s: float = 3.0) -> PytestExecution:
    """Run *tests* against *candidate_code* under pytest in a throwaway temp dir.

    Never raises: timeouts, a missing or corrupt results file, and any
    unexpected error are all folded into the returned ``PytestExecution``.
    """
    test_cases = list(tests)
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-") as temp_dir:
            temp_path = Path(temp_dir)
            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (temp_path / "test_candidate.py").write_text(_test_module_source(test_cases), encoding="utf-8")
            (temp_path / "runner.py").write_text(_runner_script(), encoding="utf-8")

            try:
                completed = subprocess.run(
                    [sys.executable, "runner.py"],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=timeout_s,
                    check=False,
                )
            except subprocess.TimeoutExpired as exc:
                # Timeout counts as "all tests failed"; keep any partial output.
                output = (exc.stdout or "") + (exc.stderr or "")
                return PytestExecution(
                    passed=0,
                    failed=max(len(test_cases), 1),
                    total=max(len(test_cases), 1),
                    timed_out=True,
                    output=(output or "pytest timed out").strip(),
                )

            result_path = temp_path / "pytest_results.json"
            if not result_path.exists():
                # Runner crashed before writing results (e.g. import error in candidate).
                output = (completed.stdout or "") + (completed.stderr or "")
                total = max(len(test_cases), 1)
                return PytestExecution(0, total, total, False, output.strip())

            try:
                payload = json.loads(result_path.read_text(encoding="utf-8"))
            except Exception as exc:
                return PytestExecution(0, max(len(test_cases), 1), max(len(test_cases), 1), False, (output or str(exc)).strip())

            passed = int(payload.get("passed", 0))
            failed = int(payload.get("failed", 0))
            # Collection errors can make passed+failed undercount; never report
            # fewer total tests than were requested.
            total = max(passed + failed, len(test_cases))
            output = ((completed.stdout or "") + (completed.stderr or "")).strip()
            return PytestExecution(passed, failed, total, False, output)
    except Exception as exc:
        # Last-resort guard so grading never crashes the server.
        return PytestExecution(0, max(len(test_cases), 1), max(len(test_cases), 1), False, str(exc))
|
| 121 |
+
|
server/graders/syntax.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task graders for syntax and bug-fix tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from .common import clamp_score, compiles, normalized_diff_score, style_score, syntax_error_message
|
| 6 |
+
from .optimization import grade_optimization_task
|
| 7 |
+
from .pytest_runner import run_pytest_suite
|
| 8 |
+
from ..models import TaskGrade
|
| 9 |
+
from ..tasks.task_bank import TaskSpec
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def grade_syntax_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Grade a syntax-fix task: full credit once the code parses, partial credit by reference similarity otherwise."""
    compile_error = syntax_error_message(candidate_code)
    diff_score = normalized_diff_score(candidate_code, task.reference_code)
    style_base = style_score(candidate_code, task.style_max_line_length)

    if compile_error:
        # Still broken: 0.15 floor plus up to 0.55 for closeness to the reference.
        partial = clamp_score(0.15 + (0.55 * diff_score))
        return TaskGrade(
            score=partial,
            syntax_score=0.0,
            quality_score=diff_score * style_base,
            details={"compile_error": compile_error},
        )
    return TaskGrade(score=1.0, syntax_score=1.0, quality_score=style_base, details={"compile_error": ""})
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def grade_bug_fix_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
    """Grade a bug-fix task by its pytest pass fraction; non-compiling code scores zero."""
    if not compiles(candidate_code):
        return TaskGrade(
            score=0.0,
            syntax_score=0.0,
            details={"compile_error": syntax_error_message(candidate_code)},
        )

    suite = list(task.visible_tests) + (list(task.hidden_tests) if include_hidden else [])
    execution = run_pytest_suite(candidate_code, suite, timeout_s=3.0)

    if execution.timed_out:
        return TaskGrade(
            score=0.0,
            syntax_score=1.0,
            tests_passed=execution.passed,
            tests_total=execution.total,
            timed_out=True,
            details={"compile_error": "", "tests": execution.output},
        )

    fraction = execution.passed / execution.total if execution.total else 0.0
    return TaskGrade(
        score=clamp_score(fraction),
        syntax_score=1.0,
        tests_passed=execution.passed,
        tests_total=execution.total,
        quality_score=style_score(candidate_code, task.style_max_line_length),
        details={"compile_error": "", "tests": execution.output},
    )
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def grade_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
    """Dispatch grading by task kind; optimization grading is the fallback."""
    kind = task.task_kind
    if kind == "syntax_fix":
        return grade_syntax_task(candidate_code, task)
    if kind == "bug_fix":
        return grade_bug_fix_task(candidate_code, task, include_hidden=include_hidden)
    return grade_optimization_task(candidate_code, task)
|
| 60 |
+
|
server/models.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typed models for the self-contained server package."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Dict, List, Literal, Optional
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
from .compat import Action, Observation, State
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
Difficulty = Literal["easy", "medium", "hard"]
|
| 13 |
+
TaskKind = Literal["syntax_fix", "bug_fix", "optimization"]
|
| 14 |
+
ActionType = Literal["analyze_code", "edit_code", "run_tests", "submit_solution"]
|
| 15 |
+
Category = Literal["bug", "security", "performance", "maintainability", "style", "testing"]
|
| 16 |
+
Severity = Literal["critical", "warning", "info"]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class HistoryEntry(BaseModel):
    """A single entry in the episode's per-step action history."""

    step: int = Field(..., ge=0)  # zero-based step index
    action_type: ActionType
    status: str  # status string reported for the action
    reward: float  # reward emitted for this step
|
| 25 |
+
|
| 26 |
+
class RewardDetails(BaseModel):
    """Itemized breakdown of one step's reward signal."""

    value: float  # final combined reward value for the step
    syntax_reward: float = 0.0
    test_reward: float = 0.0
    quality_bonus: float = 0.0
    correctness_bonus: float = 0.0
    progress_delta: float = 0.0
    stagnation_penalty: float = 0.0
    regression_penalty: float = 0.0
    invalid_action_penalty: float = 0.0
    timeout_penalty: float = 0.0
    reason: str  # human-readable explanation of the reward
    prev_score: float = 0.0
    curr_score: float = 0.0
    code_changed: bool = False  # whether the action modified the code
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class PythonCodeReviewAction(Action):
    """Action submitted by the agent on each step."""

    action_type: ActionType
    # Source-code payload; presumably consumed by edit_code/submit_solution — confirm in the env.
    code: Optional[str] = None
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class PythonCodeReviewObservation(Observation):
    """Observation returned to the agent after reset/step."""

    task_id: str
    title: str = ""
    difficulty: Difficulty
    task_kind: Optional[TaskKind] = None
    task_description: str
    current_code: str
    errors: str
    test_results: str
    visible_tests: List[str] = Field(default_factory=list)
    history: List[HistoryEntry] = Field(default_factory=list)
    attempts_remaining: int = Field(..., ge=0)
    last_action_status: str = ""
    score: float = Field(..., ge=0.0, le=1.0)  # normalized task score in [0, 1]
    # Defaults to a neutral "Reset" breakdown for the first observation.
    reward_details: RewardDetails = Field(
        default_factory=lambda: RewardDetails(value=0.0, reason="Reset")
    )
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class PythonCodeReviewState(State):
    """Server-side episode state tracked between steps."""

    episode_id: str
    step_count: int = Field(default=0, ge=0)
    task_id: Optional[str] = None
    difficulty: Optional[Difficulty] = None
    task_kind: Optional[TaskKind] = None
    attempts_remaining: int = Field(default=0, ge=0)
    current_code: str = ""
    errors: str = ""
    test_results: str = ""
    history: List[HistoryEntry] = Field(default_factory=list)
    score: float = Field(default=0.0, ge=0.0, le=1.0)
    done: bool = False  # True once the episode has terminated
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class TaskDescriptor(BaseModel):
    """Agent-facing description of a task.

    Deliberately excludes the reference solution and hidden tests (see
    ``TaskSpec.to_descriptor``).
    """

    task_id: str
    title: str
    difficulty: Difficulty
    task_kind: Optional[TaskKind] = None
    task_description: str = ""
    starter_code: str = ""
    visible_tests: List[str] = Field(default_factory=list)
    goal: str = ""
    repo_summary: str = ""
    changed_files: List[str] = Field(default_factory=list)
    available_files: List[str] = Field(default_factory=list)
    max_steps: int = Field(..., ge=1)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class TaskSummary(BaseModel):
    """Minimal listing entry for a task."""

    task_id: str
    difficulty: Difficulty
    title: str
    goal: str = ""
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class ReviewFinding(BaseModel):
    """One issue reported in a code review."""

    title: str
    file_path: str = ""
    line: Optional[int] = Field(default=None, ge=1)  # 1-based line number when known
    category: Category = "bug"
    severity: Severity = "warning"
    rationale: str = ""
    recommendation: str = ""
    rule_id: str = ""

    @property
    def explanation(self) -> str:
        """Alias for ``rationale`` — presumably kept for older callers; confirm before removing."""
        return self.rationale

    @property
    def suggested_fix(self) -> str:
        """Alias for ``recommendation`` — presumably kept for older callers; confirm before removing."""
        return self.recommendation
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
class DirectReviewResponse(BaseModel):
    """Structured result of a one-shot (non-episodic) code review."""

    issues: List[ReviewFinding] = Field(default_factory=list)
    summary: str = ""
    score: float = Field(default=0.0, ge=0.0, le=1.0)
    improved_code: Optional[str] = None  # optional rewritten source, when provided
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class TaskGrade(BaseModel):
    """Deterministic grading result for a candidate submission."""

    score: float = Field(..., ge=0.0, le=1.0)  # overall blended score
    syntax_score: float = Field(default=0.0, ge=0.0, le=1.0)
    tests_passed: int = Field(default=0, ge=0)
    tests_total: int = Field(default=0, ge=0)
    quality_score: float = Field(default=0.0, ge=0.0, le=1.0)
    runtime_score: float = Field(default=0.0, ge=0.0, le=1.0)
    timed_out: bool = False  # True when test or benchmark execution timed out
    matched_issue_ids: List[str] = Field(default_factory=list)
    false_positives: int = Field(default=0, ge=0)
    duplicate_findings: int = Field(default=0, ge=0)
    matched_weight: float = Field(default=0.0, ge=0.0, le=1.0)
    details: Dict[str, Any] = Field(default_factory=dict)  # free-form grader diagnostics
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
class HealthResponse(BaseModel):
    """Payload returned by the health endpoint."""

    status: Literal["ok"] = "ok"
    environment: str = "python_code_review_env"
    task_count: int = Field(default=0, ge=0)  # number of tasks in the bank
|
| 149 |
+
|
server/tasks/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Self-contained task definitions for container builds."""
|
| 2 |
+
|
| 3 |
+
from .task_bank import TaskSpec, get_task, list_task_descriptors, list_task_summaries, task_ids
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
"TaskSpec",
|
| 7 |
+
"get_task",
|
| 8 |
+
"list_task_descriptors",
|
| 9 |
+
"list_task_summaries",
|
| 10 |
+
"task_ids",
|
| 11 |
+
]
|
| 12 |
+
|
server/tasks/task_bank.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic task bank for self-contained server builds."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Dict, List, Optional
|
| 7 |
+
|
| 8 |
+
from ..models import Difficulty, TaskDescriptor, TaskKind
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass(frozen=True)
class TaskSpec:
    """Immutable definition of one gradable task, including tests and benchmark config."""

    task_id: str
    title: str
    difficulty: Difficulty
    task_kind: TaskKind
    task_description: str
    starter_code: str  # code presented to the agent at episode start
    reference_code: str  # known-good solution; used for similarity scoring, never exposed
    visible_tests: List[str]  # shown to the agent via the descriptor
    hidden_tests: List[str]  # used only during grading
    max_steps: int = 10
    benchmark_entrypoint: Optional[str] = None  # function name timed for optimization tasks
    benchmark_builder: Optional[str] = None  # source of build_benchmark_events() for the benchmark script
    benchmark_repeats: int = 1
    benchmark_timeout_s: float = 2.0
    style_max_line_length: int = 88
    expected_quality_markers: List[str] = field(default_factory=list)  # substrings rewarded by quality scoring

    def to_descriptor(self) -> TaskDescriptor:
        """Build the agent-facing view: omits reference code and hidden tests."""
        return TaskDescriptor(
            task_id=self.task_id,
            title=self.title,
            difficulty=self.difficulty,
            task_kind=self.task_kind,
            task_description=self.task_description,
            starter_code=self.starter_code,
            visible_tests=list(self.visible_tests),
            max_steps=self.max_steps,
        )
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
TASK_SYNTAX_FIX = TaskSpec(
|
| 44 |
+
task_id="syntax-fix-easy",
|
| 45 |
+
title="Fix a syntax-broken username normalizer",
|
| 46 |
+
difficulty="easy",
|
| 47 |
+
task_kind="syntax_fix",
|
| 48 |
+
task_description=(
|
| 49 |
+
"You are reviewing a utility function before merge. The submitted patch left "
|
| 50 |
+
"the function with syntax errors. Repair the code so it compiles and preserves "
|
| 51 |
+
"the intended behavior of trimming, lowercasing, and replacing spaces with underscores."
|
| 52 |
+
),
|
| 53 |
+
starter_code='''def normalize_username(raw_name: str) -> str:
|
| 54 |
+
cleaned = raw_name.strip().lower(
|
| 55 |
+
if not cleaned:
|
| 56 |
+
return "anonymous"
|
| 57 |
+
return cleaned.replace(" ", "_")
|
| 58 |
+
''',
|
| 59 |
+
reference_code='''def normalize_username(raw_name: str) -> str:
|
| 60 |
+
cleaned = raw_name.strip().lower()
|
| 61 |
+
if not cleaned:
|
| 62 |
+
return "anonymous"
|
| 63 |
+
return cleaned.replace(" ", "_")
|
| 64 |
+
''',
|
| 65 |
+
visible_tests=[
|
| 66 |
+
"normalize_username(' Alice Smith ') == 'alice_smith'",
|
| 67 |
+
"normalize_username(' ') == 'anonymous'",
|
| 68 |
+
"normalize_username('Bob') == 'bob'",
|
| 69 |
+
],
|
| 70 |
+
hidden_tests=[
|
| 71 |
+
"normalize_username(' HELLO WORLD ') == 'hello_world'",
|
| 72 |
+
"normalize_username('') == 'anonymous'",
|
| 73 |
+
],
|
| 74 |
+
max_steps=8,
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
TASK_BUG_FIX = TaskSpec(
|
| 79 |
+
task_id="bug-fix-medium",
|
| 80 |
+
title="Repair invoice discount calculation logic",
|
| 81 |
+
difficulty="medium",
|
| 82 |
+
task_kind="bug_fix",
|
| 83 |
+
task_description=(
|
| 84 |
+
"A billing helper function is returning the wrong amount after applying discounts. "
|
| 85 |
+
"The function signature is correct, but the calculation logic is broken. "
|
| 86 |
+
"Inspect the implementation, run visible tests, and fix the bug so all tests pass. "
|
| 87 |
+
"Do not change the function signature or validation logic."
|
| 88 |
+
),
|
| 89 |
+
starter_code='''from typing import Iterable
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def calculate_invoice_total(line_items: Iterable[int], discount_percent: int) -> int:
|
| 93 |
+
"""Calculate invoice total with discount applied."""
|
| 94 |
+
if discount_percent < 0 or discount_percent > 100:
|
| 95 |
+
raise ValueError("discount_percent must be between 0 and 100")
|
| 96 |
+
|
| 97 |
+
subtotal = sum(line_items)
|
| 98 |
+
discounted_total = subtotal - (subtotal * discount_percent // 100)
|
| 99 |
+
return subtotal
|
| 100 |
+
''',
|
| 101 |
+
reference_code='''from typing import Iterable
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def calculate_invoice_total(line_items: Iterable[int], discount_percent: int) -> int:
|
| 105 |
+
"""Calculate invoice total with discount applied."""
|
| 106 |
+
if discount_percent < 0 or discount_percent > 100:
|
| 107 |
+
raise ValueError("discount_percent must be between 0 and 100")
|
| 108 |
+
|
| 109 |
+
subtotal = sum(line_items)
|
| 110 |
+
discounted_total = subtotal - (subtotal * discount_percent // 100)
|
| 111 |
+
return discounted_total
|
| 112 |
+
''',
|
| 113 |
+
visible_tests=[
|
| 114 |
+
"calculate_invoice_total([1000, 2000], 0) == 3000",
|
| 115 |
+
"calculate_invoice_total([1000, 2000], 50) == 1500",
|
| 116 |
+
"calculate_invoice_total([1000], 10) == 900",
|
| 117 |
+
"calculate_invoice_total([], 0) == 0",
|
| 118 |
+
],
|
| 119 |
+
hidden_tests=[
|
| 120 |
+
"calculate_invoice_total([100, 200, 300], 25) == 450",
|
| 121 |
+
"calculate_invoice_total([5000], 99) == 50",
|
| 122 |
+
],
|
| 123 |
+
max_steps=10,
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# Hard-difficulty optimization task: the agent must turn an O(n**2)
# per-user rescan into a single-pass aggregation while keeping the
# sorted (count desc, user_id asc) output contract. Graded on tests,
# a wall-clock benchmark against reference_code, and style markers.
TASK_OPTIMIZATION = TaskSpec(
    task_id="optimization-hard",
    title="Optimize inefficient user activity summarization",
    difficulty="hard",
    task_kind="optimization",
    task_description=(
        "Code review found that `summarize_user_activity` is inefficient for large event streams. "
        "The current implementation repeatedly scans the full event list for every user, making it O(n**2). "
        "Refactor it to aggregate counts in one pass while preserving the sorted output contract. "
        "Style and code quality also matter: use idiomatic Python, proper types, and clear logic. "
        "All tests must pass, and the optimized version should be measurably faster."
    ),
    # Deliberately quadratic implementation handed to the agent.
    starter_code='''from typing import Iterable


def summarize_user_activity(events: Iterable[dict]) -> list[tuple[str, int]]:
    """Aggregate user activity counts."""

    ordered_users = []
    for event in events:
        user_id = event["user_id"]
        if user_id not in ordered_users:
            ordered_users.append(user_id)

    summary = []
    for user_id in ordered_users:
        count = 0
        for event in events:
            if event["user_id"] == user_id:
                count += 1
        summary.append((user_id, count))
    return sorted(summary, key=lambda item: (-item[1], item[0]))
''',
    # One-pass Counter-based solution used as the grading baseline.
    reference_code='''from collections import Counter
from typing import Iterable


def summarize_user_activity(events: Iterable[dict]) -> list[tuple[str, int]]:
    """Aggregate user activity counts in one pass."""

    counts = Counter(event["user_id"] for event in events)
    return sorted(counts.items(), key=lambda item: (-item[1], item[0]))
''',
    # Each test is a bare boolean expression evaluated by the grader.
    visible_tests=[
        "summarize_user_activity([{'user_id': 'alice'}, {'user_id': 'bob'}, {'user_id': 'alice'}]) == [('alice', 2), ('bob', 1)]",
        "summarize_user_activity([{'user_id': 'z'}, {'user_id': 'a'}]) == [('a', 1), ('z', 1)]",
        "summarize_user_activity([]) == []",
        "summarize_user_activity([{'user_id': 'solo'}]) == [('solo', 1)]",
    ],
    hidden_tests=[
        "summarize_user_activity([{'user_id': 'u2'}, {'user_id': 'u1'}, {'user_id': 'u2'}, {'user_id': 'u2'}, {'user_id': 'u1'}]) == [('u2', 3), ('u1', 2)]",
    ],
    max_steps=10,
    # Benchmark: 6000 events over 400 users, repeated 3 times,
    # each run capped at 1 second.
    benchmark_entrypoint="summarize_user_activity",
    benchmark_builder='''def build_benchmark_events():
    return [{"user_id": f"user_{index % 400}"} for index in range(6000)]''',
    benchmark_repeats=3,
    benchmark_timeout_s=1.0,
    style_max_line_length=88,
    # Quality markers the grader looks for in the submitted code.
    expected_quality_markers=["Counter", "sorted"],
)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
# Registry of all built-in tasks, keyed by task_id. Insertion order
# defines the canonical listing order (easy -> medium -> hard).
TASKS: Dict[str, TaskSpec] = {
    "syntax-fix-easy": TASK_SYNTAX_FIX,
    "bug-fix-medium": TASK_BUG_FIX,
    "optimization-hard": TASK_OPTIMIZATION,
}
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def task_ids() -> List[str]:
    """Return the ids of all registered tasks, in registry order.

    Derived from the ``TASKS`` registry instead of repeating the ids as
    literals, so this list can never drift out of sync when a task is
    added or removed. Dict insertion order preserves the original
    easy -> medium -> hard ordering.
    """
    return list(TASKS)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def get_task(task_id: str) -> TaskSpec:
    """Return the TaskSpec registered under ``task_id``.

    Raises:
        ValueError: if the id is unknown; the message lists valid ids.
    """
    spec = TASKS.get(task_id)
    if spec is None:
        # Registry values are always TaskSpec instances, so None can
        # only mean the key is absent.
        raise ValueError(f"Task {task_id} not found. Available: {list(TASKS.keys())}")
    return spec
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def list_task_descriptors() -> List[TaskDescriptor]:
    """Build a descriptor for every registered task, in registry order."""
    descriptors = []
    for task_id in task_ids():
        descriptors.append(get_task(task_id).to_descriptor())
    return descriptors
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def list_task_summaries() -> List[TaskDescriptor]:
    """Alias of ``list_task_descriptors``, kept for API compatibility."""
    summaries = list_task_descriptors()
    return summaries
|
| 213 |
+
|