Spaces:
Build error
Build error
Upload folder using huggingface_hub
Browse files. This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
- Dockerfile +22 -71
- Project.md +1111 -0
- README.md +264 -258
- REWARD_SYSTEM_GUIDE.md +206 -0
- __init__.py +35 -11
- client.py +70 -41
- compat.py +92 -0
- examples/__init__.py +1 -0
- examples/python_review_examples.py +58 -0
- graders/__init__.py +16 -0
- graders/common.py +82 -0
- graders/optimization.py +167 -0
- graders/pytest_runner.py +149 -0
- graders/syntax.py +78 -0
- inference.py +462 -314
- models.py +185 -221
- openenv.yaml +20 -7
- openenv_python_env.egg-info/PKG-INFO +6 -3
- openenv_python_env.egg-info/SOURCES.txt +13 -5
- openenv_python_env.egg-info/requires.txt +4 -1
- pyproject.toml +33 -46
- pytest-cache-files-1f62ra1g/CACHEDIR.TAG +4 -0
- pytest-cache-files-1f62ra1g/README.md +8 -0
- pytest-cache-files-i2cpw3zw/CACHEDIR.TAG +4 -0
- pytest-cache-files-i2cpw3zw/README.md +8 -0
- pytest-cache-files-le0qcl0z/CACHEDIR.TAG +4 -0
- pytest-cache-files-le0qcl0z/README.md +8 -0
- pytest-cache-files-qm8xzmpt/CACHEDIR.TAG +4 -0
- pytest-cache-files-qm8xzmpt/README.md +8 -0
- pytest-cache-files-qun9v98v/CACHEDIR.TAG +4 -0
- pytest-cache-files-qun9v98v/README.md +8 -0
- pytest-cache-files-srp2otxc/CACHEDIR.TAG +4 -0
- pytest-cache-files-srp2otxc/README.md +8 -0
- pytest-cache-files-u6t7g29i/CACHEDIR.TAG +4 -0
- pytest-cache-files-u6t7g29i/README.md +8 -0
- pytest-cache-files-x1yzwik9/CACHEDIR.TAG +4 -0
- pytest-cache-files-x1yzwik9/README.md +8 -0
- server/__init__.py +5 -11
- server/app.py +114 -81
- server/code_review_env_environment.py +9 -0
- server/code_review_environment.py +5 -0
- server/env.py +1 -0
- server/env_safe.py +492 -0
- server/grading.py +147 -0
- server/python_env_environment.py +9 -421
- server/requirements.txt +6 -6
- server/static_review.py +273 -0
- server/task_bank.py +340 -0
- summary/01_introduction_quickstart.md +66 -0
- summary/02_using_environments.md +98 -0
Dockerfile
CHANGED
|
@@ -1,81 +1,32 @@
|
|
| 1 |
-
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
# Multi-stage build using openenv-base
|
| 8 |
-
# This Dockerfile is flexible and works for both:
|
| 9 |
-
# - In-repo environments (with local OpenEnv sources)
|
| 10 |
-
# - Standalone environments (with openenv from PyPI/Git)
|
| 11 |
-
# The build script (openenv build) handles context detection and sets appropriate build args.
|
| 12 |
-
|
| 13 |
-
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
|
| 14 |
-
FROM ${BASE_IMAGE} AS builder
|
| 15 |
-
|
| 16 |
-
WORKDIR /app
|
| 17 |
-
|
| 18 |
-
# Ensure git is available (required for installing dependencies from VCS)
|
| 19 |
-
RUN apt-get update && \
|
| 20 |
-
apt-get install -y --no-install-recommends git && \
|
| 21 |
-
rm -rf /var/lib/apt/lists/*
|
| 22 |
-
|
| 23 |
-
# Build argument to control whether we're building standalone or in-repo
|
| 24 |
-
ARG BUILD_MODE=in-repo
|
| 25 |
-
ARG ENV_NAME=python_env
|
| 26 |
-
|
| 27 |
-
# Copy environment code (always at root of build context)
|
| 28 |
-
COPY . /app/env
|
| 29 |
-
|
| 30 |
-
# For in-repo builds, openenv is already vendored in the build context
|
| 31 |
-
# For standalone builds, openenv will be installed via pyproject.toml
|
| 32 |
-
WORKDIR /app/env
|
| 33 |
-
|
| 34 |
-
# Ensure uv is available (for local builds where base image lacks it)
|
| 35 |
-
RUN if ! command -v uv >/dev/null 2>&1; then \
|
| 36 |
-
curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 37 |
-
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 38 |
-
mv /root/.local/bin/uvx /usr/local/bin/uvx; \
|
| 39 |
-
fi
|
| 40 |
-
|
| 41 |
-
# Install dependencies using uv sync
|
| 42 |
-
# If uv.lock exists, use it; otherwise resolve on the fly
|
| 43 |
-
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 44 |
-
if [ -f uv.lock ]; then \
|
| 45 |
-
uv sync --frozen --no-install-project --no-editable; \
|
| 46 |
-
else \
|
| 47 |
-
uv sync --no-install-project --no-editable; \
|
| 48 |
-
fi
|
| 49 |
-
|
| 50 |
-
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 51 |
-
if [ -f uv.lock ]; then \
|
| 52 |
-
uv sync --frozen --no-editable; \
|
| 53 |
-
else \
|
| 54 |
-
uv sync --no-editable; \
|
| 55 |
-
fi
|
| 56 |
-
|
| 57 |
-
# Final runtime stage
|
| 58 |
-
FROM ${BASE_IMAGE}
|
| 59 |
|
| 60 |
WORKDIR /app
|
| 61 |
|
| 62 |
-
#
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
-
# Copy
|
| 66 |
-
COPY
|
| 67 |
|
| 68 |
-
#
|
| 69 |
-
|
| 70 |
|
| 71 |
-
# Set
|
| 72 |
-
ENV
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
# Health check
|
| 75 |
-
HEALTHCHECK --interval=30s --timeout=
|
| 76 |
-
CMD curl -f http://localhost:
|
| 77 |
|
| 78 |
-
# Run
|
| 79 |
-
|
| 80 |
ENV ENABLE_WEB_INTERFACE=true
|
| 81 |
-
CMD ["
|
|
|
|
# syntax=docker/dockerfile:1
FROM python:3.11-slim

WORKDIR /app

# System dependencies:
#   gcc  - build wheels that ship C extensions
#   git  - install pip dependencies pinned to VCS URLs
#   curl - used by the HEALTHCHECK probe below
# update + install combined in one layer, lists removed in the same layer
# so the apt cache never persists into the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
        gcc \
        git \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies BEFORE copying the full source tree so this
# layer stays cached until requirements.txt itself changes. (Copying all
# sources first would reinstall every dependency on any code edit.)
# NOTE(review): assumes requirements.txt lives at the build-context root,
# matching the original `pip install -r requirements.txt` — confirm.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy application source
COPY . /app

# Runtime configuration — one grouped instruction, key=value form.
ENV PYTHONUNBUFFERED=1 \
    HOST=0.0.0.0 \
    PORT=8000 \
    WORKERS=1 \
    MAX_CONCURRENT_ENVS=16 \
    ENABLE_WEB_INTERFACE=true

# Drop root: run as an unprivileged system user. Port 8000 is above 1024,
# so no extra capabilities are needed to bind it.
RUN groupadd --system app \
    && useradd --system --gid app --home /app app \
    && chown -R app:app /app
USER app

# Health check — shell form on purpose so ${PORT} expands at container runtime.
HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
    CMD curl -f http://localhost:${PORT}/health || exit 1

# EXPOSE is documentation only; publishing still requires -p at `docker run`.
EXPOSE ${PORT}

# Exec-form CMD so the server is PID 1 and receives SIGTERM from `docker stop`.
CMD ["python", "-m", "server.app"]
Project.md
ADDED
|
@@ -0,0 +1,1111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python inference.py --model gpt-3.5-turbo --base-url "http://localhost:8000/v1"
|
| 2 |
+
python inference.py --model gemini-2.0-flash --base-url "https://generativelanguage.googleapis.com/openai/"
|
| 3 |
+
python inference.py --model deepseek-chat --base-url "https://api.deepseek.com"

# Python Env Project Guide
|
| 4 |
+
|
| 5 |
+
This document explains how to work with the `python_env` project end to end:
|
| 6 |
+
|
| 7 |
+
1. What the environment is trying to do
|
| 8 |
+
2. How the current code is structured
|
| 9 |
+
3. How each route works
|
| 10 |
+
4. How to test each route manually
|
| 11 |
+
5. How to use the inference script
|
| 12 |
+
6. How to prepare data so an RL or agent-training setup can learn more effectively
|
| 13 |
+
7. How the project maps to the hackathon functional requirements
|
| 14 |
+
|
| 15 |
+
The goal is practical: after reading this file, you should be able to start the server, hit every route, understand what each response means, run the baseline, and know what data to collect next.
|
| 16 |
+
|
| 17 |
+
## 1. Project Goal
|
| 18 |
+
|
| 19 |
+
This environment simulates a real software engineering workflow: Python code review.
|
| 20 |
+
|
| 21 |
+
An agent is given Python code and must:
|
| 22 |
+
|
| 23 |
+
- detect correctness bugs
|
| 24 |
+
- detect security risks
|
| 25 |
+
- detect maintainability problems
|
| 26 |
+
- detect obvious performance issues
|
| 27 |
+
- optionally suggest improved code
|
| 28 |
+
|
| 29 |
+
This is a valid real-world environment because code review is an actual human task used in engineering teams every day.
|
| 30 |
+
|
| 31 |
+
## 2. High-Level Architecture
|
| 32 |
+
|
| 33 |
+
The project has four main parts:
|
| 34 |
+
|
| 35 |
+
- `models.py`
|
| 36 |
+
Defines the typed Pydantic models for actions, observations, evaluations, config, health, and direct-review payloads.
|
| 37 |
+
|
| 38 |
+
- `server/code_review_environment.py`
|
| 39 |
+
Implements the environment logic: `reset()`, `step()`, reward shaping, task progression, hints, history, and grading integration.
|
| 40 |
+
|
| 41 |
+
- `server/task_bank.py`, `server/grading.py`, `server/static_review.py`
|
| 42 |
+
These files define the benchmark tasks, deterministic graders, and direct static review rules.
|
| 43 |
+
|
| 44 |
+
- `server/app.py`
|
| 45 |
+
Exposes both:
|
| 46 |
+
- OpenEnv-compatible endpoints such as `/reset`, `/step`, `/state`, `/schema`, `/ws`
|
| 47 |
+
- custom REST endpoints such as `/health`, `/tasks`, `/review`, `/config`, `/history`
|
| 48 |
+
|
| 49 |
+
- `inference.py`
|
| 50 |
+
Runs an OpenAI-compatible model against the environment and writes a reproducible report.
|
| 51 |
+
|
| 52 |
+
## 3. File-by-File Understanding
|
| 53 |
+
|
| 54 |
+
### `models.py`
|
| 55 |
+
|
| 56 |
+
Important models:
|
| 57 |
+
|
| 58 |
+
- `ReviewFinding`
|
| 59 |
+
One code-review issue found by the agent.
|
| 60 |
+
Fields:
|
| 61 |
+
- `title`
|
| 62 |
+
- `line`
|
| 63 |
+
- `category`
|
| 64 |
+
- `severity`
|
| 65 |
+
- `rationale`
|
| 66 |
+
- `recommendation`
|
| 67 |
+
- `rule_id`
|
| 68 |
+
|
| 69 |
+
- `PythonReviewAction`
|
| 70 |
+
What the agent sends to the environment.
|
| 71 |
+
Fields:
|
| 72 |
+
- `operation`
|
| 73 |
+
- `findings`
|
| 74 |
+
- `patched_code`
|
| 75 |
+
- `note`
|
| 76 |
+
|
| 77 |
+
- `PythonReviewObservation`
|
| 78 |
+
What the environment returns back.
|
| 79 |
+
Fields:
|
| 80 |
+
- `task`
|
| 81 |
+
- `instructions`
|
| 82 |
+
- `feedback`
|
| 83 |
+
- `submitted_findings`
|
| 84 |
+
- `hints_used`
|
| 85 |
+
- `attempts_remaining`
|
| 86 |
+
- `evaluation`
|
| 87 |
+
- `score`
|
| 88 |
+
- `review_time_ms`
|
| 89 |
+
- inherited OpenEnv fields such as `reward`, `done`, `metadata`
|
| 90 |
+
|
| 91 |
+
- `TaskEvaluation`
|
| 92 |
+
Deterministic grading output.
|
| 93 |
+
Fields:
|
| 94 |
+
- `matched_reference_ids`
|
| 95 |
+
- `matched_findings`
|
| 96 |
+
- `total_findings`
|
| 97 |
+
- `false_positives`
|
| 98 |
+
- `duplicate_findings`
|
| 99 |
+
- `weighted_recall`
|
| 100 |
+
- `patch_score`
|
| 101 |
+
- `score`
|
| 102 |
+
- `passed`
|
| 103 |
+
|
| 104 |
+
### `server/task_bank.py`
|
| 105 |
+
|
| 106 |
+
Contains the benchmark tasks.
|
| 107 |
+
|
| 108 |
+
Current tasks:
|
| 109 |
+
|
| 110 |
+
1. `py-review-easy`
|
| 111 |
+
Detect unsafe `eval` and division-by-zero risk.
|
| 112 |
+
|
| 113 |
+
2. `py-review-medium`
|
| 114 |
+
Detect mutable default list, quadratic membership check, and bare `except`.
|
| 115 |
+
|
| 116 |
+
3. `py-review-hard`
|
| 117 |
+
Detect `shell=True` command injection, stale cache bug, and shared output file risk.
|
| 118 |
+
|
| 119 |
+
Each task contains:
|
| 120 |
+
|
| 121 |
+
- code to review
|
| 122 |
+
- hints
|
| 123 |
+
- reference findings
|
| 124 |
+
- pass threshold
|
| 125 |
+
|
| 126 |
+
### `server/grading.py`
|
| 127 |
+
|
| 128 |
+
This is the benchmark grader.
|
| 129 |
+
|
| 130 |
+
It compares submitted findings to hidden reference findings and computes:
|
| 131 |
+
|
| 132 |
+
- weighted recall
|
| 133 |
+
- penalties for false positives
|
| 134 |
+
- penalties for duplicates
|
| 135 |
+
- optional patch quality score
|
| 136 |
+
- final score in `0.0` to `1.0`
|
| 137 |
+
|
| 138 |
+
This makes the task deterministic and reproducible, which is important for hackathon judging.
|
| 139 |
+
|
| 140 |
+
### `server/static_review.py`
|
| 141 |
+
|
| 142 |
+
This powers the `/review` endpoint for arbitrary code snippets.
|
| 143 |
+
|
| 144 |
+
It uses AST inspection to detect:
|
| 145 |
+
|
| 146 |
+
- `eval` / `exec`
|
| 147 |
+
- mutable default arguments
|
| 148 |
+
- `shell=True`
|
| 149 |
+
- bare `except`
|
| 150 |
+
- list-membership-inside-loop performance smell
|
| 151 |
+
- syntax errors
|
| 152 |
+
- `print()` used in application logic
|
| 153 |
+
|
| 154 |
+
This is not the task grader. It is the direct-review helper.
|
| 155 |
+
|
| 156 |
+
### Reward System
|
| 157 |
+
|
| 158 |
+
The reward system is **dynamic and multi-component**, designed to provide meaningful feedback at every step of the agent's learning process.
|
| 159 |
+
|
| 160 |
+
#### Reward Architecture
|
| 161 |
+
|
| 162 |
+
The system computes rewards using **6 independent components**:
|
| 163 |
+
|
| 164 |
+
1. **Progress Reward** (max +0.25)
|
| 165 |
+
- Awarded when the agent improves the score from one step to the next
|
| 166 |
+
- Formula: `min(PROGRESS_SCALE * score_delta, 0.25)`
|
| 167 |
+
- Encourages continuous improvement
|
| 168 |
+
|
| 169 |
+
2. **Syntax Reward** (max +0.35)
|
| 170 |
+
- One-time bonus awarded for fixing syntax errors (first time compiling)
|
| 171 |
+
- Applied once per episode when code transitions from uncompilable to compilable
|
| 172 |
+
- Acknowledges the critical first step of making code valid
|
| 173 |
+
|
| 174 |
+
3. **Test Reward** (max +0.20)
|
| 175 |
+
- Based on improvement in test pass rate
|
| 176 |
+
- Computed as: `min(TEST_PASS_REWARD_SCALE * test_improvement_fraction, 0.20)`
|
| 177 |
+
- Rewards incremental progress on passing more tests
|
| 178 |
+
|
| 179 |
+
4. **Quality Reward** (max +0.15)
|
| 180 |
+
- Based on AST-detected code quality metrics
|
| 181 |
+
- Rewards improvements in code structure, readability, and best practices
|
| 182 |
+
- Uses deterministic grader feedback
|
| 183 |
+
|
| 184 |
+
5. **Stagnation Penalty** (−0.10)
|
| 185 |
+
- Applied when the agent takes action but code doesn't change
|
| 186 |
+
- Encourages the agent to edit the code rather than analyze repeatedly
|
| 187 |
+
- Configurable via `STAGNATION_PENALTY` constant
|
| 188 |
+
|
| 189 |
+
6. **Regression Penalty** (scale −0.20)
|
| 190 |
+
- Applied when score decreases from previous step
|
| 191 |
+
- Formula: `REGRESSION_PENALTY_SCALE * abs(score_delta)`
|
| 192 |
+
- Discourages actions that make code worse
|
| 193 |
+
|
| 194 |
+
#### Reward Constants
|
| 195 |
+
|
| 196 |
+
Defined at the top of `server/env.py`:
|
| 197 |
+
|
| 198 |
+
```python
|
| 199 |
+
SYNTAX_FIX_BONUS = 0.35 # One-time syntax reward
|
| 200 |
+
TEST_PASS_REWARD_SCALE = 0.30 # Per test improvement
|
| 201 |
+
QUALITY_BONUS_SCALE = 0.15 # Code quality improvement
|
| 202 |
+
PROGRESS_SCALE = 0.25 # Score improvement
|
| 203 |
+
COMPLETION_BONUS = 0.50 # Full correctness bonus
|
| 204 |
+
INVALID_ACTION_PENALTY = 0.15 # For unsupported actions
|
| 205 |
+
STAGNATION_PENALTY = 0.10 # For unchanged code
|
| 206 |
+
REGRESSION_PENALTY_SCALE = 0.20 # For score decline
|
| 207 |
+
TIMEOUT_PENALTY = 0.15 # For execution timeout
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
#### Final Reward Computation
|
| 211 |
+
|
| 212 |
+
The final reward is:
|
| 213 |
+
|
| 214 |
+
```
|
| 215 |
+
total = progress + syntax + test + quality - stagnation - regression
|
| 216 |
+
final_reward = clamp(total, -1.0, +1.0)
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
The result is always between −1.0 and +1.0, providing bounded, interpretable feedback.
|
| 220 |
+
|
| 221 |
+
#### RewardDetails: Transparent Feedback
|
| 222 |
+
|
| 223 |
+
Every reward is returned as a `RewardDetails` object with these fields:
|
| 224 |
+
|
| 225 |
+
- `value`: The scalar reward for this step
|
| 226 |
+
- `syntax_reward`: Contribution from syntax fixes
|
| 227 |
+
- `test_reward`: Contribution from test improvements
|
| 228 |
+
- `quality_bonus`: Contribution from code quality
|
| 229 |
+
- `progress_delta`: Contribution from score improvement
|
| 230 |
+
- `stagnation_penalty`: Penalty for unchanged code
|
| 231 |
+
- `regression_penalty`: Penalty for score decline
|
| 232 |
+
- `prev_score` / `curr_score`: Score before and after the action
|
| 233 |
+
- `code_changed`: Whether the action modified the code
|
| 234 |
+
- `reason`: Human-readable explanation of the reward
|
| 235 |
+
|
| 236 |
+
This transparency is crucial for:
|
| 237 |
+
- Debugging agent behavior
|
| 238 |
+
- Understanding what drives reward
|
| 239 |
+
- Tuning the constants
|
| 240 |
+
- Training supervised models on reward components
|
| 241 |
+
|
| 242 |
+
#### Why This Design Helps Agents Learn
|
| 243 |
+
|
| 244 |
+
1. **Non-Constant**: Different actions produce different rewards, enabling meaningful gradient signals
|
| 245 |
+
2. **Progressive**: Early bonuses (syntax) are high; later improvements are smaller, promoting efficiency
|
| 246 |
+
3. **Transparent**: Detailed component breakdown helps agents understand what matters
|
| 247 |
+
4. **Bounded**: Clamping to [−1, 1] prevents reward hacking and explosion
|
| 248 |
+
5. **Balanced**: Positive and negative signals teach precision and recall together
|
| 249 |
+
|
| 250 |
+
### `server/code_review_environment.py`
|
| 251 |
+
|
| 252 |
+
This is the environment core.
|
| 253 |
+
|
| 254 |
+
Main methods:
|
| 255 |
+
|
| 256 |
+
- `reset()`
|
| 257 |
+
Rotates to the next task, resets episode state, and returns the initial observation.
|
| 258 |
+
|
| 259 |
+
- `step(action)`
|
| 260 |
+
Accepts a `PythonReviewAction`, grades it, shapes reward, updates history, and returns the new observation.
|
| 261 |
+
|
| 262 |
+
- `direct_review(code, context)`
|
| 263 |
+
Calls the static reviewer for arbitrary code.
|
| 264 |
+
|
| 265 |
+
- `list_tasks()`
|
| 266 |
+
Returns public descriptors for all tasks.
|
| 267 |
+
|
| 268 |
+
- `grade_task_submission(task_id, findings, patched_code)`
|
| 269 |
+
Grades a proposed submission against the deterministic rubric without stepping through an episode.
|
| 270 |
+
|
| 271 |
+
### `server/app.py`
|
| 272 |
+
|
| 273 |
+
This file wires everything to FastAPI and OpenEnv.
|
| 274 |
+
|
| 275 |
+
Important note:
|
| 276 |
+
|
| 277 |
+
- OpenEnv endpoints are managed through `create_app(PythonEnvironment, PythonReviewAction, PythonReviewObservation)`
|
| 278 |
+
- custom routes such as `/health`, `/tasks`, `/review`, `/history`, `/config` use a singleton `python_env`
|
| 279 |
+
|
| 280 |
+
That means:
|
| 281 |
+
|
| 282 |
+
- `/reset` and `/step` are served by OpenEnv session handling
|
| 283 |
+
- `/review`, `/tasks`, `/config`, `/history` are served by the singleton helper instance
|
| 284 |
+
|
| 285 |
+
This is fine for startup and manual testing, but if you want one fully unified state model later, you should refactor custom routes to read from the same managed environment/session layer.
|
| 286 |
+
|
| 287 |
+
## 4. Route-by-Route Guide
|
| 288 |
+
|
| 289 |
+
### OpenEnv Routes
|
| 290 |
+
|
| 291 |
+
These are important for validation and agents.
|
| 292 |
+
|
| 293 |
+
#### `POST /reset`
|
| 294 |
+
|
| 295 |
+
Purpose:
|
| 296 |
+
- starts a new episode
|
| 297 |
+
- rotates to the next benchmark task
|
| 298 |
+
- returns an initial observation
|
| 299 |
+
|
| 300 |
+
Use this when:
|
| 301 |
+
- you want to start evaluating an agent on a task
|
| 302 |
+
|
| 303 |
+
#### `POST /step`
|
| 304 |
+
|
| 305 |
+
Purpose:
|
| 306 |
+
- submit agent actions
|
| 307 |
+
- get reward, observation, and done flag
|
| 308 |
+
|
| 309 |
+
Use this when:
|
| 310 |
+
- manually simulating agent steps
|
| 311 |
+
- testing reward shaping and grading
|
| 312 |
+
|
| 313 |
+
#### `GET /state`
|
| 314 |
+
|
| 315 |
+
Purpose:
|
| 316 |
+
- returns current OpenEnv session state, typically `episode_id` and `step_count`
|
| 317 |
+
|
| 318 |
+
Use this when:
|
| 319 |
+
- debugging session behavior
|
| 320 |
+
|
| 321 |
+
#### `GET /schema`
|
| 322 |
+
|
| 323 |
+
Purpose:
|
| 324 |
+
- shows the action/observation schema expected by OpenEnv
|
| 325 |
+
|
| 326 |
+
Use this when:
|
| 327 |
+
- debugging payload formats
|
| 328 |
+
- verifying OpenEnv compatibility
|
| 329 |
+
|
| 330 |
+
#### `WS /ws`
|
| 331 |
+
|
| 332 |
+
Purpose:
|
| 333 |
+
- persistent lower-latency session transport for clients
|
| 334 |
+
|
| 335 |
+
Use this when:
|
| 336 |
+
- building actual agent loops with the `EnvClient`
|
| 337 |
+
|
| 338 |
+
### Custom REST Routes
|
| 339 |
+
|
| 340 |
+
#### `GET /health`
|
| 341 |
+
|
| 342 |
+
Purpose:
|
| 343 |
+
- quick health check for Docker and Hugging Face Spaces
|
| 344 |
+
|
| 345 |
+
Use this when:
|
| 346 |
+
- checking whether the server is alive
|
| 347 |
+
- validating deployment health
|
| 348 |
+
|
| 349 |
+
#### `GET /tasks`
|
| 350 |
+
|
| 351 |
+
Purpose:
|
| 352 |
+
- returns the three benchmark task descriptors
|
| 353 |
+
|
| 354 |
+
Use this when:
|
| 355 |
+
- reviewing available tasks
|
| 356 |
+
- building curriculum/eval metadata
|
| 357 |
+
|
| 358 |
+
#### `GET /tasks/{task_id}`
|
| 359 |
+
|
| 360 |
+
Purpose:
|
| 361 |
+
- returns one task descriptor
|
| 362 |
+
|
| 363 |
+
Use this when:
|
| 364 |
+
- inspecting a task before submitting findings
|
| 365 |
+
|
| 366 |
+
#### `POST /tasks/{task_id}/grade`
|
| 367 |
+
|
| 368 |
+
Purpose:
|
| 369 |
+
- grade a proposed set of findings against the deterministic task rubric
|
| 370 |
+
|
| 371 |
+
Use this when:
|
| 372 |
+
- validating benchmark grading directly
|
| 373 |
+
- building offline evaluation sets
|
| 374 |
+
|
| 375 |
+
#### `POST /review`
|
| 376 |
+
|
| 377 |
+
Purpose:
|
| 378 |
+
- run direct static review on arbitrary Python code
|
| 379 |
+
|
| 380 |
+
Use this when:
|
| 381 |
+
- testing the static analyzer
|
| 382 |
+
- building training examples
|
| 383 |
+
- verifying that common issues are caught
|
| 384 |
+
|
| 385 |
+
#### `GET /history`
|
| 386 |
+
|
| 387 |
+
Purpose:
|
| 388 |
+
- returns the singleton environment history
|
| 389 |
+
|
| 390 |
+
Use this when:
|
| 391 |
+
- checking what the custom singleton environment has processed
|
| 392 |
+
|
| 393 |
+
Note:
|
| 394 |
+
- this history is not the same as OpenEnv session history from `/step`
|
| 395 |
+
|
| 396 |
+
#### `DELETE /history`
|
| 397 |
+
|
| 398 |
+
Purpose:
|
| 399 |
+
- clears the singleton history
|
| 400 |
+
|
| 401 |
+
Use this when:
|
| 402 |
+
- resetting the custom review log before a test run
|
| 403 |
+
|
| 404 |
+
#### `GET /config`
|
| 405 |
+
|
| 406 |
+
Purpose:
|
| 407 |
+
- inspect config values such as penalties and task order
|
| 408 |
+
|
| 409 |
+
#### `PUT /config`
|
| 410 |
+
|
| 411 |
+
Purpose:
|
| 412 |
+
- update the environment config
|
| 413 |
+
|
| 414 |
+
Use this when:
|
| 415 |
+
- testing different reward penalties or task order
|
| 416 |
+
|
| 417 |
+
## 5. Manual Testing: Step by Step
|
| 418 |
+
|
| 419 |
+
Start the server:
|
| 420 |
+
|
| 421 |
+
```powershell
|
| 422 |
+
uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
|
| 423 |
+
```
|
| 424 |
+
|
| 425 |
+
Open the docs:
|
| 426 |
+
|
| 427 |
+
```text
|
| 428 |
+
http://127.0.0.1:8000/docs
|
| 429 |
+
```
|
| 430 |
+
|
| 431 |
+
That is the easiest manual route explorer.
|
| 432 |
+
|
| 433 |
+
### Test 1: Health
|
| 434 |
+
|
| 435 |
+
```powershell
|
| 436 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/health" -Method Get
|
| 437 |
+
```
|
| 438 |
+
|
| 439 |
+
Expected:
|
| 440 |
+
- `status` should be `ok`
|
| 441 |
+
- `task_count` should be `3`
|
| 442 |
+
|
| 443 |
+
### Test 2: List Tasks
|
| 444 |
+
|
| 445 |
+
```powershell
|
| 446 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/tasks" -Method Get
|
| 447 |
+
```
|
| 448 |
+
|
| 449 |
+
Expected:
|
| 450 |
+
- three tasks
|
| 451 |
+
- each task has `task_id`, `difficulty`, `title`, `objective`, `code`
|
| 452 |
+
|
| 453 |
+
### Test 3: Get One Task
|
| 454 |
+
|
| 455 |
+
```powershell
|
| 456 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/tasks/py-review-easy" -Method Get
|
| 457 |
+
```
|
| 458 |
+
|
| 459 |
+
### Test 4: Direct Static Review
|
| 460 |
+
|
| 461 |
+
```powershell
|
| 462 |
+
$body = @{
|
| 463 |
+
code = @"
|
| 464 |
+
def load_settings(config_text):
|
| 465 |
+
return eval(config_text)
|
| 466 |
+
"@
|
| 467 |
+
} | ConvertTo-Json
|
| 468 |
+
|
| 469 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/review" `
|
| 470 |
+
-Method Post `
|
| 471 |
+
-Body $body `
|
| 472 |
+
-ContentType "application/json"
|
| 473 |
+
```
|
| 474 |
+
|
| 475 |
+
Expected:
|
| 476 |
+
- at least one issue
|
| 477 |
+
- one issue should have `rule_id = "avoid-eval"`
|
| 478 |
+
|
| 479 |
+
### Test 5: Reset Episode
|
| 480 |
+
|
| 481 |
+
```powershell
|
| 482 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/reset" `
|
| 483 |
+
-Method Post `
|
| 484 |
+
-Body "{}" `
|
| 485 |
+
-ContentType "application/json"
|
| 486 |
+
```
|
| 487 |
+
|
| 488 |
+
Expected:
|
| 489 |
+
- an observation with a `task`
|
| 490 |
+
- `done = false`
|
| 491 |
+
- `reward = 0`
|
| 492 |
+
|
| 493 |
+
### Test 6: Submit Partial Findings To `/step`
|
| 494 |
+
|
| 495 |
+
```powershell
|
| 496 |
+
$body = @{
|
| 497 |
+
operation = "submit_findings"
|
| 498 |
+
findings = @(
|
| 499 |
+
@{
|
| 500 |
+
title = "Avoid eval on untrusted configuration data"
|
| 501 |
+
line = 2
|
| 502 |
+
category = "security"
|
| 503 |
+
severity = "critical"
|
| 504 |
+
rationale = "eval can execute attacker-controlled code."
|
| 505 |
+
recommendation = "Use json.loads or ast.literal_eval."
|
| 506 |
+
rule_id = "avoid-eval"
|
| 507 |
+
}
|
| 508 |
+
)
|
| 509 |
+
patched_code = $null
|
| 510 |
+
note = "First pass review"
|
| 511 |
+
} | ConvertTo-Json -Depth 5
|
| 512 |
+
|
| 513 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/step" `
|
| 514 |
+
-Method Post `
|
| 515 |
+
-Body $body `
|
| 516 |
+
-ContentType "application/json"
|
| 517 |
+
```
|
| 518 |
+
|
| 519 |
+
Expected:
|
| 520 |
+
- positive reward
|
| 521 |
+
- improved `score`
|
| 522 |
+
- feedback mentioning a matched rubric item
|
| 523 |
+
|
| 524 |
+
### Test 7: Request A Hint
|
| 525 |
+
|
| 526 |
+
```powershell
|
| 527 |
+
$body = @{
|
| 528 |
+
operation = "request_hint"
|
| 529 |
+
findings = @()
|
| 530 |
+
patched_code = $null
|
| 531 |
+
note = "Need help"
|
| 532 |
+
} | ConvertTo-Json -Depth 5
|
| 533 |
+
|
| 534 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/step" `
|
| 535 |
+
-Method Post `
|
| 536 |
+
-Body $body `
|
| 537 |
+
-ContentType "application/json"
|
| 538 |
+
```
|
| 539 |
+
|
| 540 |
+
Expected:
|
| 541 |
+
- small negative reward
|
| 542 |
+
- feedback containing `Hint 1: ...`
|
| 543 |
+
|
| 544 |
+
### Test 8: Finalize A Full Submission
|
| 545 |
+
|
| 546 |
+
```powershell
|
| 547 |
+
$body = @{
|
| 548 |
+
operation = "finalize"
|
| 549 |
+
findings = @(
|
| 550 |
+
@{
|
| 551 |
+
title = "Avoid eval on untrusted configuration data"
|
| 552 |
+
line = 2
|
| 553 |
+
category = "security"
|
| 554 |
+
severity = "critical"
|
| 555 |
+
rationale = "eval can execute attacker-controlled code."
|
| 556 |
+
recommendation = "Use json.loads or ast.literal_eval."
|
| 557 |
+
rule_id = "avoid-eval"
|
| 558 |
+
},
|
| 559 |
+
@{
|
| 560 |
+
title = "Default count of zero causes a division by zero"
|
| 561 |
+
line = 5
|
| 562 |
+
category = "bug"
|
| 563 |
+
severity = "warning"
|
| 564 |
+
rationale = "count defaults to zero and division crashes."
|
| 565 |
+
recommendation = "Validate count before dividing."
|
| 566 |
+
rule_id = "division-by-zero-default"
|
| 567 |
+
}
|
| 568 |
+
)
|
| 569 |
+
patched_code = $null
|
| 570 |
+
note = "Final review"
|
| 571 |
+
} | ConvertTo-Json -Depth 6
|
| 572 |
+
|
| 573 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/step" `
|
| 574 |
+
-Method Post `
|
| 575 |
+
-Body $body `
|
| 576 |
+
-ContentType "application/json"
|
| 577 |
+
```
|
| 578 |
+
|
| 579 |
+
Expected:
|
| 580 |
+
- `done = true`
|
| 581 |
+
- `evaluation.passed = true`
|
| 582 |
+
- `score` near or above task threshold
|
| 583 |
+
|
| 584 |
+
### Test 9: Inspect State
|
| 585 |
+
|
| 586 |
+
```powershell
|
| 587 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/state" -Method Get
|
| 588 |
+
```
|
| 589 |
+
|
| 590 |
+
### Test 10: Inspect Schemas
|
| 591 |
+
|
| 592 |
+
```powershell
|
| 593 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/schema" -Method Get
|
| 594 |
+
```
|
| 595 |
+
|
| 596 |
+
### Test 11: Grade A Task Without Running An Episode
|
| 597 |
+
|
| 598 |
+
```powershell
|
| 599 |
+
$body = @{
|
| 600 |
+
operation = "submit_findings"
|
| 601 |
+
findings = @(
|
| 602 |
+
@{
|
| 603 |
+
title = "shell=True with interpolated input allows command injection"
|
| 604 |
+
line = 10
|
| 605 |
+
category = "security"
|
| 606 |
+
severity = "critical"
|
| 607 |
+
rationale = "The command string includes user input and runs via shell."
|
| 608 |
+
recommendation = "Pass args as a list and keep shell=False."
|
| 609 |
+
rule_id = "shell-true-command-injection"
|
| 610 |
+
}
|
| 611 |
+
)
|
| 612 |
+
patched_code = $null
|
| 613 |
+
note = "Offline grader test"
|
| 614 |
+
} | ConvertTo-Json -Depth 6
|
| 615 |
+
|
| 616 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/tasks/py-review-hard/grade" `
|
| 617 |
+
-Method Post `
|
| 618 |
+
-Body $body `
|
| 619 |
+
-ContentType "application/json"
|
| 620 |
+
```
|
| 621 |
+
|
| 622 |
+
### Test 12: Config Read And Update
|
| 623 |
+
|
| 624 |
+
Read:
|
| 625 |
+
|
| 626 |
+
```powershell
|
| 627 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/config" -Method Get
|
| 628 |
+
```
|
| 629 |
+
|
| 630 |
+
Update:
|
| 631 |
+
|
| 632 |
+
```powershell
|
| 633 |
+
$body = @{
|
| 634 |
+
task_order = @("py-review-easy", "py-review-medium", "py-review-hard")
|
| 635 |
+
max_steps_per_task = 4
|
| 636 |
+
hint_penalty = 0.05
|
| 637 |
+
false_positive_penalty = 0.08
|
| 638 |
+
duplicate_penalty = 0.03
|
| 639 |
+
patch_bonus_multiplier = 0.2
|
| 640 |
+
max_history_entries = 50
|
| 641 |
+
} | ConvertTo-Json
|
| 642 |
+
|
| 643 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/config" `
|
| 644 |
+
-Method Put `
|
| 645 |
+
-Body $body `
|
| 646 |
+
-ContentType "application/json"
|
| 647 |
+
```
|
| 648 |
+
|
| 649 |
+
### Test 13: History
|
| 650 |
+
|
| 651 |
+
```powershell
|
| 652 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/history" -Method Get
|
| 653 |
+
```
|
| 654 |
+
|
| 655 |
+
Clear:
|
| 656 |
+
|
| 657 |
+
```powershell
|
| 658 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/history" -Method Delete
|
| 659 |
+
```
|
| 660 |
+
|
| 661 |
+
## 6. How To Test Using The Inference Script
|
| 662 |
+
|
| 663 |
+
The inference script is for model-vs-environment evaluation.
|
| 664 |
+
|
| 665 |
+
### Required Variables
|
| 666 |
+
|
| 667 |
+
```powershell
|
| 668 |
+
$env:API_BASE_URL="https://api.openai.com/v1"
|
| 669 |
+
$env:MODEL_NAME="gpt-4.1-mini"
|
| 670 |
+
$env:OPENAI_API_KEY="your_key_here"
|
| 671 |
+
```
|
| 672 |
+
|
| 673 |
+
If you want it to hit your local server instead of launching Docker:
|
| 674 |
+
|
| 675 |
+
```powershell
|
| 676 |
+
$env:ENV_BASE_URL="http://127.0.0.1:8000"
|
| 677 |
+
```
|
| 678 |
+
|
| 679 |
+
Optional:
|
| 680 |
+
|
| 681 |
+
```powershell
|
| 682 |
+
$env:MAX_TASKS="3"
|
| 683 |
+
$env:MAX_STEPS="3"
|
| 684 |
+
$env:INFERENCE_REPORT_PATH="inference_results.json"
|
| 685 |
+
```
|
| 686 |
+
|
| 687 |
+
Run:
|
| 688 |
+
|
| 689 |
+
```powershell
|
| 690 |
+
python inference.py
|
| 691 |
+
```
|
| 692 |
+
|
| 693 |
+
What it does:
|
| 694 |
+
|
| 695 |
+
1. connects to the environment
|
| 696 |
+
2. resets through up to 3 tasks
|
| 697 |
+
3. sends task code and feedback to the model
|
| 698 |
+
4. expects strict JSON findings back
|
| 699 |
+
5. submits them through `step()`
|
| 700 |
+
6. logs score and reward per step
|
| 701 |
+
7. writes a final report JSON file
|
| 702 |
+
|
| 703 |
+
### How To Interpret The Output
|
| 704 |
+
|
| 705 |
+
Focus on:
|
| 706 |
+
|
| 707 |
+
- `mean_score`
|
| 708 |
+
Overall average benchmark score
|
| 709 |
+
|
| 710 |
+
- per-task `score`
|
| 711 |
+
How well the model solved each task
|
| 712 |
+
|
| 713 |
+
- `passed`
|
| 714 |
+
Whether score met that task’s threshold
|
| 715 |
+
|
| 716 |
+
- step logs
|
| 717 |
+
Show whether the model is improving over trajectory or getting stuck
|
| 718 |
+
|
| 719 |
+
If the model keeps returning empty findings:
|
| 720 |
+
|
| 721 |
+
- improve the system prompt
|
| 722 |
+
- reduce task ambiguity
|
| 723 |
+
- add examples of desired findings
|
| 724 |
+
- ensure the model endpoint supports the chosen format well
|
| 725 |
+
|
| 726 |
+
## 7. How To Build Better Training Data
|
| 727 |
+
|
| 728 |
+
If you want an RL environment to actually learn, the biggest bottleneck is data quality.
|
| 729 |
+
|
| 730 |
+
You need more than just three final benchmark tasks. You need trajectories, partial attempts, and failure examples.
|
| 731 |
+
|
| 732 |
+
### Data Types You Should Collect
|
| 733 |
+
|
| 734 |
+
#### A. Gold Task Rubrics
|
| 735 |
+
|
| 736 |
+
For each task, store:
|
| 737 |
+
|
| 738 |
+
- code snippet
|
| 739 |
+
- hidden reference findings
|
| 740 |
+
- severity
|
| 741 |
+
- category
|
| 742 |
+
- expected line numbers
|
| 743 |
+
- good recommendations
|
| 744 |
+
|
| 745 |
+
This is already partially represented by `server/task_bank.py`.
|
| 746 |
+
|
| 747 |
+
#### B. Positive Demonstrations
|
| 748 |
+
|
| 749 |
+
Create solved examples where the review is high quality.
|
| 750 |
+
|
| 751 |
+
Each example should include:
|
| 752 |
+
|
| 753 |
+
- task code
|
| 754 |
+
- one or more strong findings
|
| 755 |
+
- strong rationales
|
| 756 |
+
- strong recommendations
|
| 757 |
+
- optional patch
|
| 758 |
+
- final score
|
| 759 |
+
|
| 760 |
+
This helps supervised warm-start and behavior cloning.
|
| 761 |
+
|
| 762 |
+
#### C. Partial Trajectories
|
| 763 |
+
|
| 764 |
+
This is important for RL.
|
| 765 |
+
|
| 766 |
+
Store intermediate attempts like:
|
| 767 |
+
|
| 768 |
+
- first attempt finds one issue
|
| 769 |
+
- second attempt adds another issue
|
| 770 |
+
- third attempt finalizes
|
| 771 |
+
|
| 772 |
+
This is what teaches agents to improve over time, not just emit one final perfect answer.
|
| 773 |
+
|
| 774 |
+
#### D. Negative Examples
|
| 775 |
+
|
| 776 |
+
You should also store:
|
| 777 |
+
|
| 778 |
+
- false positives
|
| 779 |
+
- irrelevant complaints
|
| 780 |
+
- duplicate findings
|
| 781 |
+
- hallucinated issues
|
| 782 |
+
- weak recommendations
|
| 783 |
+
|
| 784 |
+
Why:
|
| 785 |
+
- the reward function penalizes these
|
| 786 |
+
- the model must learn precision, not just recall
|
| 787 |
+
|
| 788 |
+
#### E. Hint Usage Examples
|
| 789 |
+
|
| 790 |
+
Store trajectories where:
|
| 791 |
+
|
| 792 |
+
- the agent requests a hint
|
| 793 |
+
- then improves its findings
|
| 794 |
+
|
| 795 |
+
This teaches policy behavior around when hints are worth the penalty.
|
| 796 |
+
|
| 797 |
+
#### F. Patch Examples
|
| 798 |
+
|
| 799 |
+
For tasks where patch quality matters, store:
|
| 800 |
+
|
| 801 |
+
- original code
|
| 802 |
+
- weak patch
|
| 803 |
+
- good patch
|
| 804 |
+
- patch score
|
| 805 |
+
|
| 806 |
+
This helps the model learn that code edits should remove actual problems, not just change formatting.
|
| 807 |
+
|
| 808 |
+
## 8. Recommended Dataset Format
|
| 809 |
+
|
| 810 |
+
Use JSONL so it is easy to stream and train on.
|
| 811 |
+
|
| 812 |
+
### Benchmark Task Record
|
| 813 |
+
|
| 814 |
+
```json
|
| 815 |
+
{
|
| 816 |
+
"task_id": "py-review-easy",
|
| 817 |
+
"difficulty": "easy",
|
| 818 |
+
"code": "def load_settings(config_text):\n return eval(config_text)",
|
| 819 |
+
"reference_findings": [
|
| 820 |
+
{
|
| 821 |
+
"rule_id": "avoid-eval",
|
| 822 |
+
"line": 2,
|
| 823 |
+
"category": "security",
|
| 824 |
+
"severity": "critical"
|
| 825 |
+
}
|
| 826 |
+
]
|
| 827 |
+
}
|
| 828 |
+
```
|
| 829 |
+
|
| 830 |
+
### Trajectory Record
|
| 831 |
+
|
| 832 |
+
```json
|
| 833 |
+
{
|
| 834 |
+
"task_id": "py-review-medium",
|
| 835 |
+
"episode_id": "abc123",
|
| 836 |
+
"steps": [
|
| 837 |
+
{
|
| 838 |
+
"observation_feedback": "Review the Python snippet.",
|
| 839 |
+
"action": {
|
| 840 |
+
"operation": "submit_findings",
|
| 841 |
+
"findings": [
|
| 842 |
+
{
|
| 843 |
+
"title": "Mutable default argument leaks state",
|
| 844 |
+
"line": 1,
|
| 845 |
+
"category": "bug",
|
| 846 |
+
"severity": "warning"
|
| 847 |
+
}
|
| 848 |
+
]
|
| 849 |
+
},
|
| 850 |
+
"reward": 0.35,
|
| 851 |
+
"score": 0.35
|
| 852 |
+
},
|
| 853 |
+
{
|
| 854 |
+
"observation_feedback": "Matched 1 new rubric item(s): mutable-default-list",
|
| 855 |
+
"action": {
|
| 856 |
+
"operation": "finalize",
|
| 857 |
+
"findings": [
|
| 858 |
+
{
|
| 859 |
+
"title": "Mutable default argument leaks state",
|
| 860 |
+
"line": 1,
|
| 861 |
+
"category": "bug",
|
| 862 |
+
"severity": "warning"
|
| 863 |
+
},
|
| 864 |
+
{
|
| 865 |
+
"title": "Bare except hides failures",
|
| 866 |
+
"line": 12,
|
| 867 |
+
"category": "maintainability",
|
| 868 |
+
"severity": "warning"
|
| 869 |
+
}
|
| 870 |
+
]
|
| 871 |
+
},
|
| 872 |
+
"reward": 0.27,
|
| 873 |
+
"score": 0.62
|
| 874 |
+
}
|
| 875 |
+
]
|
| 876 |
+
}
|
| 877 |
+
```
|
| 878 |
+
|
| 879 |
+
## 9. How To Make RL Learn Better
|
| 880 |
+
|
| 881 |
+
### A. Add More Tasks
|
| 882 |
+
|
| 883 |
+
Three tasks are enough for the minimum requirement, but not enough for strong training.
|
| 884 |
+
|
| 885 |
+
You should expand with:
|
| 886 |
+
|
| 887 |
+
- file I/O bugs
|
| 888 |
+
- API misuse
|
| 889 |
+
- SQL injection
|
| 890 |
+
- unsafe deserialization
|
| 891 |
+
- concurrency issues
|
| 892 |
+
- caching mistakes
|
| 893 |
+
- resource leaks
|
| 894 |
+
- logic edge cases
|
| 895 |
+
|
| 896 |
+
Target:
|
| 897 |
+
|
| 898 |
+
- 50 to 200 deterministic tasks
|
| 899 |
+
- grouped by difficulty and domain
|
| 900 |
+
|
| 901 |
+
### B. Add More Partial Reward Signals
|
| 902 |
+
|
| 903 |
+
Current reward is already better than binary success/fail, but you can improve it.
|
| 904 |
+
|
| 905 |
+
Possible additions:
|
| 906 |
+
|
| 907 |
+
- small bonus when the first critical issue is found early
|
| 908 |
+
- higher reward for critical issues than style issues
|
| 909 |
+
- bonus when rationale quality is high
|
| 910 |
+
- bonus when recommendation mentions a correct mitigation pattern
|
| 911 |
+
- penalty if line numbers are missing when they should be known
|
| 912 |
+
|
| 913 |
+
### C. Improve Context In Observation
|
| 914 |
+
|
| 915 |
+
Right now the observation already gives:
|
| 916 |
+
|
| 917 |
+
- task metadata
|
| 918 |
+
- previous feedback
|
| 919 |
+
- submitted findings
|
| 920 |
+
- attempts remaining
|
| 921 |
+
|
| 922 |
+
You can improve learning further by including:
|
| 923 |
+
|
| 924 |
+
- a short list of matched findings so far
|
| 925 |
+
- a short list of remaining categories not yet covered
|
| 926 |
+
- normalized review rubric hints without leaking answers
|
| 927 |
+
- last action summary
|
| 928 |
+
|
| 929 |
+
This helps the agent reason about what it already did and what is still missing.
|
| 930 |
+
|
| 931 |
+
### D. Separate Training Tasks From Benchmark Tasks
|
| 932 |
+
|
| 933 |
+
Important:
|
| 934 |
+
|
| 935 |
+
- training tasks should be large and varied
|
| 936 |
+
- benchmark tasks should stay hidden and fixed
|
| 937 |
+
|
| 938 |
+
Do not train directly on the same exact benchmark set you plan to judge on.
|
| 939 |
+
|
| 940 |
+
### E. Add Preference Data
|
| 941 |
+
|
| 942 |
+
You can train preference models on:
|
| 943 |
+
|
| 944 |
+
- strong vs weak findings
|
| 945 |
+
- precise vs vague recommendations
|
| 946 |
+
- useful vs noisy patches
|
| 947 |
+
|
| 948 |
+
This is valuable for ranking quality beyond exact rubric matches.
|
| 949 |
+
|
| 950 |
+
## 10. Functional Requirements Mapping
|
| 951 |
+
|
| 952 |
+
Here is how your environment should be judged against the stated requirements.
|
| 953 |
+
|
| 954 |
+
### Requirement: Real-World Task Simulation
|
| 955 |
+
|
| 956 |
+
Status:
|
| 957 |
+
- satisfied in direction
|
| 958 |
+
|
| 959 |
+
Why:
|
| 960 |
+
- code review is a genuine engineering task
|
| 961 |
+
|
| 962 |
+
How to improve further:
|
| 963 |
+
- expand beyond tiny snippets into multi-function modules
|
| 964 |
+
- include operational and maintainability review, not just security lints
|
| 965 |
+
|
| 966 |
+
### Requirement: OpenEnv Spec Compliance
|
| 967 |
+
|
| 968 |
+
Status:
|
| 969 |
+
- mostly implemented in code
|
| 970 |
+
|
| 971 |
+
Implemented pieces:
|
| 972 |
+
- typed action model
|
| 973 |
+
- typed observation model
|
| 974 |
+
- `reset()`
|
| 975 |
+
- `step()`
|
| 976 |
+
- `state`
|
| 977 |
+
- `openenv.yaml`
|
| 978 |
+
- FastAPI/OpenEnv routes
|
| 979 |
+
|
| 980 |
+
What you still need to verify:
|
| 981 |
+
- `openenv validate`
|
| 982 |
+
- schema compatibility under your installed OpenEnv version
|
| 983 |
+
|
| 984 |
+
### Requirement: Minimum 3 Tasks With Agent Graders
|
| 985 |
+
|
| 986 |
+
Status:
|
| 987 |
+
- implemented
|
| 988 |
+
|
| 989 |
+
You have:
|
| 990 |
+
- easy
|
| 991 |
+
- medium
|
| 992 |
+
- hard
|
| 993 |
+
- deterministic grader returning `0.0` to `1.0`
|
| 994 |
+
|
| 995 |
+
### Requirement: Meaningful Reward Function
|
| 996 |
+
|
| 997 |
+
Status:
|
| 998 |
+
- implemented
|
| 999 |
+
|
| 1000 |
+
Current reward signals:
|
| 1001 |
+
- new rubric matches
|
| 1002 |
+
- false positive penalties
|
| 1003 |
+
- duplicate penalties
|
| 1004 |
+
- hint penalties
|
| 1005 |
+
- patch bonus
|
| 1006 |
+
- finalize pass bonus
|
| 1007 |
+
|
| 1008 |
+
### Requirement: Baseline Inference Script
|
| 1009 |
+
|
| 1010 |
+
Status:
|
| 1011 |
+
- implemented
|
| 1012 |
+
|
| 1013 |
+
Current `inference.py`:
|
| 1014 |
+
- uses OpenAI client
|
| 1015 |
+
- reads env vars
|
| 1016 |
+
- runs tasks
|
| 1017 |
+
- writes report
|
| 1018 |
+
|
| 1019 |
+
What to verify:
|
| 1020 |
+
- actual runtime under 20 minutes
|
| 1021 |
+
- reproducible output with your chosen model endpoint
|
| 1022 |
+
|
| 1023 |
+
### Requirement: HF Spaces + Docker
|
| 1024 |
+
|
| 1025 |
+
Status:
|
| 1026 |
+
- code is prepared
|
| 1027 |
+
|
| 1028 |
+
You still need to verify:
|
| 1029 |
+
|
| 1030 |
+
- `docker build -f server/Dockerfile .`
|
| 1031 |
+
- local container startup
|
| 1032 |
+
- `openenv push`
|
| 1033 |
+
- `/health` returns 200 on the deployed Space
|
| 1034 |
+
|
| 1035 |
+
## 11. Recommended Manual Validation Checklist
|
| 1036 |
+
|
| 1037 |
+
Before submission, run these in order:
|
| 1038 |
+
|
| 1039 |
+
1. Start server locally
|
| 1040 |
+
2. Hit `/health`
|
| 1041 |
+
3. Hit `/docs`
|
| 1042 |
+
4. Test `/tasks`
|
| 1043 |
+
5. Test `/review` with unsafe examples
|
| 1044 |
+
6. Test `/reset`
|
| 1045 |
+
7. Test `/step` with partial findings
|
| 1046 |
+
8. Test `/step` with finalize
|
| 1047 |
+
9. Test `/tasks/{task_id}/grade`
|
| 1048 |
+
10. Run `pytest`
|
| 1049 |
+
11. Run `openenv validate`
|
| 1050 |
+
12. Run `python inference.py`
|
| 1051 |
+
13. Build Docker image
|
| 1052 |
+
14. Deploy to Hugging Face Space
|
| 1053 |
+
15. Re-test `/health` and `/reset` on the live Space
|
| 1054 |
+
|
| 1055 |
+
## 12. Suggested Immediate Next Steps
|
| 1056 |
+
|
| 1057 |
+
If you want the environment to become stronger quickly, do this next:
|
| 1058 |
+
|
| 1059 |
+
1. Add 10 to 20 more benchmark-style tasks in `server/task_bank.py`
|
| 1060 |
+
2. Save solved and failed trajectories as JSONL files under a new `dataset/` directory
|
| 1061 |
+
3. Refactor custom route state so `/history` and OpenEnv `/step` share one coherent session story
|
| 1062 |
+
4. Run `openenv validate`
|
| 1063 |
+
5. Run `inference.py` against your local server and inspect the report
|
| 1064 |
+
|
| 1065 |
+
## 13. Quick Commands Summary
|
| 1066 |
+
|
| 1067 |
+
Start server:
|
| 1068 |
+
|
| 1069 |
+
```powershell
|
| 1070 |
+
uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
|
| 1071 |
+
```
|
| 1072 |
+
|
| 1073 |
+
Open docs:
|
| 1074 |
+
|
| 1075 |
+
```text
|
| 1076 |
+
http://127.0.0.1:8000/docs
|
| 1077 |
+
```
|
| 1078 |
+
|
| 1079 |
+
Run example tests:
|
| 1080 |
+
|
| 1081 |
+
```powershell
|
| 1082 |
+
python -m pytest tests -q
|
| 1083 |
+
```
|
| 1084 |
+
|
| 1085 |
+
Run inference locally:
|
| 1086 |
+
|
| 1087 |
+
```powershell
|
| 1088 |
+
$env:API_BASE_URL="https://api.openai.com/v1"
|
| 1089 |
+
$env:MODEL_NAME="gpt-4.1-mini"
|
| 1090 |
+
$env:OPENAI_API_KEY="your_key"
|
| 1091 |
+
$env:ENV_BASE_URL="http://127.0.0.1:8000"
|
| 1092 |
+
python inference.py
|
| 1093 |
+
```
|
| 1094 |
+
|
| 1095 |
+
Validate OpenEnv:
|
| 1096 |
+
|
| 1097 |
+
```powershell
|
| 1098 |
+
openenv validate
|
| 1099 |
+
```
|
| 1100 |
+
|
| 1101 |
+
Build Docker:
|
| 1102 |
+
|
| 1103 |
+
```powershell
|
| 1104 |
+
docker build -t python_env-env:latest -f server/Dockerfile .
|
| 1105 |
+
```
|
| 1106 |
+
|
| 1107 |
+
Deploy:
|
| 1108 |
+
|
| 1109 |
+
```powershell
|
| 1110 |
+
openenv push
|
| 1111 |
+
```
|
README.md
CHANGED
|
@@ -1,266 +1,272 @@
|
|
| 1 |
---
|
| 2 |
-
title: Python
|
| 3 |
-
emoji: 🎶
|
| 4 |
-
colorFrom: purple
|
| 5 |
-
colorTo: red
|
| 6 |
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
app_port: 8000
|
| 9 |
base_path: /web
|
|
|
|
| 10 |
tags:
|
| 11 |
- openenv
|
|
|
|
| 12 |
---
|
| 13 |
|
| 14 |
-
# Python
|
| 15 |
-
|
| 16 |
-
A
|
| 17 |
-
|
| 18 |
-
##
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
-
|
| 50 |
-
-
|
| 51 |
-
-
|
| 52 |
-
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
#
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
###
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
###
|
| 126 |
-
|
| 127 |
-
-
|
| 128 |
-
-
|
| 129 |
-
-
|
| 130 |
-
-
|
| 131 |
-
-
|
| 132 |
-
|
| 133 |
-
##
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
#
|
| 140 |
-
|
| 141 |
-
#
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
#
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
``
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
#
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
```
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
###
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
return
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
-
|
| 227 |
-
-
|
| 228 |
-
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
``
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Python Code Review Environment Server
|
|
|
|
|
|
|
|
|
|
| 3 |
sdk: docker
|
|
|
|
| 4 |
app_port: 8000
|
| 5 |
base_path: /web
|
| 6 |
+
pinned: false
|
| 7 |
tags:
|
| 8 |
- openenv
|
| 9 |
+
- code-review
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# Python Code Review Environment
|
| 13 |
+
|
| 14 |
+
A production-grade OpenEnv environment for Python code review, repair, and optimization tasks. This environment simulates real-world developer workflows where an AI agent reviews, fixes, and improves Python code.
|
| 15 |
+
|
| 16 |
+
## Overview
|
| 17 |
+
|
| 18 |
+
**`python_code_review_env`** is a deterministic benchmark environment featuring:
|
| 19 |
+
|
| 20 |
+
- ✅ **3 real-world tasks** with increasing difficulty (Syntax, Bug Fix, Optimization)
|
| 21 |
+
- ✅ **Deterministic graders** using AST analysis, pytest execution, and performance benchmarking
|
| 22 |
+
- ✅ **OpenAI-compatible API** supporting free/open models (Gemini, DeepSeek, Together, OpenRouter)
|
| 23 |
+
- ✅ **Production-ready Docker** deployment for Hugging Face Spaces
|
| 24 |
+
- ✅ **Structured Observations & Actions** following OpenEnv spec
|
| 25 |
+
- ✅ **Rich reward shaping** with bonuses for syntax fixes, test passes, and optimization
|
| 26 |
+
|
| 27 |
+
## Tasks
|
| 28 |
+
|
| 29 |
+
### 1. 🟢 Easy: Syntax Fixing
|
| 30 |
+
|
| 31 |
+
**Task ID**: `syntax-fix-easy`
|
| 32 |
+
|
| 33 |
+
Fix broken Python code with syntax errors.
|
| 34 |
+
|
| 35 |
+
- **Difficulty**: Easy
|
| 36 |
+
- **Goal**: Repair syntax errors to make code compile
|
| 37 |
+
- **Starter Code**: Function with missing closing parenthesis
|
| 38 |
+
- **Grading**: Compilation check + code similarity to reference
|
| 39 |
+
- **Score Range**: 0.0–1.0
|
| 40 |
+
|
| 41 |
+
### 2. 🟡 Medium: Bug Fixing
|
| 42 |
+
|
| 43 |
+
**Task ID**: `bug-fix-medium`
|
| 44 |
+
|
| 45 |
+
Fix logic bugs with visible and hidden test cases.
|
| 46 |
+
|
| 47 |
+
- **Difficulty**: Medium
|
| 48 |
+
- **Goal**: Repair a logic error in invoice calculation
|
| 49 |
+
- **Starter Code**: Function that returns wrong total (returns subtotal instead of discounted)
|
| 50 |
+
- **Grading**: Test pass fraction (visible & hidden)
|
| 51 |
+
- **Score Range**: 0.0–1.0
|
| 52 |
+
|
| 53 |
+
### 3. 🔴 Hard: Optimization & Refactoring
|
| 54 |
+
|
| 55 |
+
**Task ID**: `optimization-hard`
|
| 56 |
+
|
| 57 |
+
Optimize inefficient code while maintaining correctness.
|
| 58 |
+
|
| 59 |
+
- **Difficulty**: Hard
|
| 60 |
+
- **Goal**: Convert O(n²) duplicate removal to O(n) with set
|
| 61 |
+
- **Starter Code**: Slow nested-loop implementation
|
| 62 |
+
- **Grading**: 50% correctness + 30% speedup + 15% code quality + 5% style
|
| 63 |
+
- **Score Range**: 0.0–1.0
|
| 64 |
+
- **Bonus**: Runtime benchmarking against reference implementation
|
| 65 |
+
|
| 66 |
+
## Quick Start
|
| 67 |
+
|
| 68 |
+
### Run Locally
|
| 69 |
+
|
| 70 |
+
```bash
|
| 71 |
+
cd python-code-review-env
|
| 72 |
+
pip install -r server/requirements.txt
|
| 73 |
+
python -m server.app
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
Visit http://localhost:8000/docs for interactive API
|
| 77 |
+
|
| 78 |
+
### Run with Docker
|
| 79 |
+
|
| 80 |
+
```bash
|
| 81 |
+
docker build -f server/Dockerfile -t python_code_review_env:latest .
|
| 82 |
+
docker run -p 8000:8000 python_code_review_env:latest
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
### Run Inference
|
| 86 |
+
|
| 87 |
+
```bash
|
| 88 |
+
python inference.py --model "gpt-3.5-turbo" --base-url "http://localhost:8000/v1"
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
## OpenEnv Specification
|
| 92 |
+
|
| 93 |
+
### Observation
|
| 94 |
+
|
| 95 |
+
```json
|
| 96 |
+
{
|
| 97 |
+
"task_id": "syntax-fix-easy",
|
| 98 |
+
"difficulty": "easy",
|
| 99 |
+
"task_description": "Fix syntax errors...",
|
| 100 |
+
"current_code": "def normalize_username(raw_name: str) -> str:\n cleaned = raw_name.strip().lower(\n ...",
|
| 101 |
+
"errors": "invalid syntax ( line 2, column 40 )",
|
| 102 |
+
"test_results": "Not run yet.",
|
| 103 |
+
"visible_tests": ["normalize_username(' Alice Smith ') == 'alice_smith'"],
|
| 104 |
+
"history": [],
|
| 105 |
+
"attempts_remaining": 8,
|
| 106 |
+
"score": 0.0,
|
| 107 |
+
"reward": {
|
| 108 |
+
"value": 0.0,
|
| 109 |
+
"reason": "Episode reset."
|
| 110 |
+
}
|
| 111 |
+
}
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
### Action
|
| 115 |
+
|
| 116 |
+
```json
|
| 117 |
+
{
|
| 118 |
+
"action_type": "edit_code",
|
| 119 |
+
"code": "def normalize_username(raw_name: str) -> str:\n cleaned = raw_name.strip().lower()\n if not cleaned:\n return \"anonymous\"\n return cleaned.replace(\" \", \"_\")"
|
| 120 |
+
}
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
### Reward Details
|
| 124 |
+
|
| 125 |
+
- **+0.2**: Syntax fixed (one-time per episode)
|
| 126 |
+
- **+0.15**: Passing additional test (cumulative per test)
|
| 127 |
+
- **+0.1**: Code quality improvement
|
| 128 |
+
- **+0.5**: Full correctness (100% hidden tests, one-time)
|
| 129 |
+
- **-0.1**: Invalid action
|
| 130 |
+
|
| 131 |
+
## Architecture
|
| 132 |
+
|
| 133 |
+
```
|
| 134 |
+
python_code_review_env/
|
| 135 |
+
├── models.py # Pydantic models (Observation, Action, Reward)
|
| 136 |
+
├── server/
|
| 137 |
+
│ ├── app.py # FastAPI server
|
| 138 |
+
│ ├── env.py # OpenEnv environment
|
| 139 |
+
│ ├── Dockerfile # Docker config
|
| 140 |
+
│ └── requirements.txt
|
| 141 |
+
├── graders/
|
| 142 |
+
│ ├── common.py # Shared utilities
|
| 143 |
+
│ ├── syntax.py # Syntax/bug graders
|
| 144 |
+
│ ├── optimization.py# Optimization grader
|
| 145 |
+
│ └── pytest_runner.py
|
| 146 |
+
├── tasks/
|
| 147 |
+
│ ├── task_bank.py # 3 deterministic tasks
|
| 148 |
+
│ └── __init__.py
|
| 149 |
+
├── inference.py # Baseline evaluation script
|
| 150 |
+
├── openenv.yaml # OpenEnv spec
|
| 151 |
+
├── pyproject.toml # Project metadata
|
| 152 |
+
└── README.md
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
## FastAPI Endpoints
|
| 156 |
+
|
| 157 |
+
- `GET /health` – Health check
|
| 158 |
+
- `GET /tasks` – List all tasks
|
| 159 |
+
- `GET /tasks/{task_id}` – Get task details
|
| 160 |
+
- `POST /tasks/{task_id}/grade` – Grade code offline
|
| 161 |
+
- Standard OpenEnv endpoints (`/reset`, `/step`, `/state`)
|
| 162 |
+
|
| 163 |
+
## Deterministic Graders
|
| 164 |
+
|
| 165 |
+
### Syntax Fix
|
| 166 |
+
```
|
| 167 |
+
if code compiles:
|
| 168 |
+
score = 1.0
|
| 169 |
+
else:
|
| 170 |
+
score = 0.15 + 0.55 * similarity_to_reference
|
| 171 |
+
```
|
| 172 |
+
|
| 173 |
+
### Bug Fix
|
| 174 |
+
```
|
| 175 |
+
score = test_pass_fraction (0.0 to 1.0)
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
### Optimization
|
| 179 |
+
```
|
| 180 |
+
score = (
|
| 181 |
+
0.5 * test_fraction +
|
| 182 |
+
0.3 * speedup_score +
|
| 183 |
+
0.15 * code_quality +
|
| 184 |
+
0.05 * pep8_style
|
| 185 |
+
)
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
## Examples
|
| 189 |
+
|
| 190 |
+
### Using Python
|
| 191 |
+
|
| 192 |
+
```python
|
| 193 |
+
from server.env import PythonCodeReviewEnvironment
|
| 194 |
+
from models import PythonCodeReviewAction
|
| 195 |
+
|
| 196 |
+
env = PythonCodeReviewEnvironment()
|
| 197 |
+
obs = env.reset(task_id="syntax-fix-easy")
|
| 198 |
+
|
| 199 |
+
action = PythonCodeReviewAction(
|
| 200 |
+
action_type="edit_code",
|
| 201 |
+
code="""def normalize_username(raw_name: str) -> str:
|
| 202 |
+
cleaned = raw_name.strip().lower()
|
| 203 |
+
if not cleaned:
|
| 204 |
+
return "anonymous"
|
| 205 |
+
return cleaned.replace(" ", "_")
|
| 206 |
+
"""
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
obs = env.step(action)
|
| 210 |
+
print(f"Score: {obs.score}")
|
| 211 |
+
print(f"Reward: {obs.reward.value:+.3f}")
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
### Using cURL
|
| 215 |
+
|
| 216 |
+
```bash
|
| 217 |
+
# Check health
|
| 218 |
+
curl http://localhost:8000/health
|
| 219 |
+
|
| 220 |
+
# List tasks
|
| 221 |
+
curl http://localhost:8000/tasks
|
| 222 |
+
|
| 223 |
+
# Grade code
|
| 224 |
+
curl -X POST http://localhost:8000/tasks/syntax-fix-easy/grade \
|
| 225 |
+
-H "Content-Type: application/json" \
|
| 226 |
+
-d '{"action_type": "edit_code", "code": "..."}'
|
| 227 |
+
```
|
| 228 |
+
|
| 229 |
+
## Deployment
|
| 230 |
+
|
| 231 |
+
### Hugging Face Spaces
|
| 232 |
+
|
| 233 |
+
1. Create Space > Docker
|
| 234 |
+
2. Upload files + `server/Dockerfile`
|
| 235 |
+
3. Space auto-deploys on CPU
|
| 236 |
+
4. Monitor `/health` endpoint
|
| 237 |
+
|
| 238 |
+
### Local Docker
|
| 239 |
+
|
| 240 |
+
```bash
|
| 241 |
+
docker build -f server/Dockerfile -t python_code_review_env .
|
| 242 |
+
docker run -p 8000:8000 \
|
| 243 |
+
-e MAX_CONCURRENT_ENVS=16 \
|
| 244 |
+
python_code_review_env
|
| 245 |
+
```
|
| 246 |
+
|
| 247 |
+
## Performance
|
| 248 |
+
|
| 249 |
+
- Startup: < 5s
|
| 250 |
+
- Reset: < 100ms
|
| 251 |
+
- Step: 50ms–3s (depends on action)
|
| 252 |
+
- Inference (3 tasks): < 20 minutes
|
| 253 |
+
- CPU: Works on 2 vCPU, 8GB RAM
|
| 254 |
+
|
| 255 |
+
## Validation Checklist
|
| 256 |
+
|
| 257 |
+
- ✅ 3 deterministic tasks
|
| 258 |
+
- ✅ Deterministic graders (AST, pytest, benchmarks)
|
| 259 |
+
- ✅ `/health` → 200
|
| 260 |
+
- ✅ Scores vary per task (not constant)
|
| 261 |
+
- ✅ Docker builds successfully
|
| 262 |
+
- ✅ OpenEnv spec compliant
|
| 263 |
+
- ✅ Reward shaping working
|
| 264 |
+
- ✅ All tests deterministic and reproducible
|
| 265 |
+
|
| 266 |
+
## License
|
| 267 |
+
|
| 268 |
+
MIT
|
| 269 |
+
|
| 270 |
+
---
|
| 271 |
+
|
| 272 |
+
**Built for production. Deterministic. Deployable. Extensible.**
|
REWARD_SYSTEM_GUIDE.md
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Reward System Implementation Guide
|
| 2 |
+
|
| 3 |
+
This document shows how the reward system is implemented in code and how to use it.
|
| 4 |
+
|
| 5 |
+
## Module Documentation
|
| 6 |
+
|
| 7 |
+
The reward system architecture is documented at the module level:
|
| 8 |
+
|
| 9 |
+
```python
|
| 10 |
+
import server.env
|
| 11 |
+
print(server.env.__doc__)
|
| 12 |
+
```
|
| 13 |
+
|
| 14 |
+
Output shows all 6 reward components and the final computation formula.
|
| 15 |
+
|
| 16 |
+
## Reward Constants
|
| 17 |
+
|
| 18 |
+
All reward constants are defined in `server/env.py` (lines 57-87):
|
| 19 |
+
|
| 20 |
+
```python
|
| 21 |
+
# Component 1: Score improvement reward
|
| 22 |
+
PROGRESS_SCALE = 0.25
|
| 23 |
+
|
| 24 |
+
# Component 2: Syntax/compilation fix reward
|
| 25 |
+
SYNTAX_FIX_BONUS = 0.35
|
| 26 |
+
|
| 27 |
+
# Component 3: Test improvement reward
|
| 28 |
+
TEST_PASS_REWARD_SCALE = 0.30
|
| 29 |
+
|
| 30 |
+
# Component 4: Code quality reward
|
| 31 |
+
QUALITY_BONUS_SCALE = 0.15
|
| 32 |
+
|
| 33 |
+
# Component 5: Stagnation penalty
|
| 34 |
+
STAGNATION_PENALTY = 0.10
|
| 35 |
+
|
| 36 |
+
# Component 6: Regression penalty
|
| 37 |
+
REGRESSION_PENALTY_SCALE = 0.20
|
| 38 |
+
|
| 39 |
+
# One-time completion bonus
|
| 40 |
+
COMPLETION_BONUS = 0.50
|
| 41 |
+
|
| 42 |
+
# Invalid/error penalties
|
| 43 |
+
INVALID_ACTION_PENALTY = 0.15
|
| 44 |
+
TIMEOUT_PENALTY = 0.15
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
To tune the reward system, edit these constants and re-test.
|
| 48 |
+
|
| 49 |
+
## RewardDetails Model Documentation
|
| 50 |
+
|
| 51 |
+
Located in `models.py` (lines 26-80):
|
| 52 |
+
|
| 53 |
+
```python
|
| 54 |
+
from models import RewardDetails
|
| 55 |
+
print(RewardDetails.__doc__)
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
Shows all 15 fields and their meanings:
|
| 59 |
+
- `value`: Final scalar reward [-1.0, +1.0]
|
| 60 |
+
- `progress_delta`: Score improvement component
|
| 61 |
+
- `syntax_reward`: Syntax fix bonus
|
| 62 |
+
- `test_reward`: Test improvement bonus
|
| 63 |
+
- `quality_bonus`: Code quality improvement
|
| 64 |
+
- `stagnation_penalty`: Unchanged code penalty
|
| 65 |
+
- `regression_penalty`: Score decline penalty
|
| 66 |
+
- `reason`: Human-readable explanation
|
| 67 |
+
- `prev_score`, `curr_score`: Score before/after
|
| 68 |
+
- `code_changed`: Whether code was modified
|
| 69 |
+
|
| 70 |
+
## Core Computation Method
|
| 71 |
+
|
| 72 |
+
The main reward computation is in `_compute_reward_components()` (server/env.py, lines 507-703):
|
| 73 |
+
|
| 74 |
+
```python
|
| 75 |
+
def _compute_reward_components(
|
| 76 |
+
self,
|
| 77 |
+
curr_score: float,
|
| 78 |
+
prev_score: float,
|
| 79 |
+
curr_grade: TaskGrade,
|
| 80 |
+
code_changed: bool,
|
| 81 |
+
prev_grade_score: float = 0.0,
|
| 82 |
+
) -> dict:
|
| 83 |
+
"""Compute all six reward components and return combined result."""
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
### What It Does
|
| 87 |
+
|
| 88 |
+
1. **Initializes** empty component dict
|
| 89 |
+
2. **Computes each component**:
|
| 90 |
+
- Progress: Score improvement scaled by PROGRESS_SCALE
|
| 91 |
+
- Syntax: One-time bonus if first compile
|
| 92 |
+
- Test: Test pass rate improvement scaled by TEST_PASS_REWARD_SCALE
|
| 93 |
+
- Quality: Code quality improvement scaled by QUALITY_BONUS_SCALE
|
| 94 |
+
- Stagnation: Penalty if code unchanged
|
| 95 |
+
- Regression: Penalty if score decreased
|
| 96 |
+
3. **Combines**: Sums positives, subtracts negatives
|
| 97 |
+
4. **Clamps**: Bounds result to [-1.0, +1.0]
|
| 98 |
+
|
| 99 |
+
### Key Design Decisions
|
| 100 |
+
|
| 101 |
+
- **Monotonic tracking**: Best test rate and quality in episode are tracked
|
| 102 |
+
- **One-time bonuses**: Syntax reward awarded once per episode
|
| 103 |
+
- **Scale capping**: Each component has a maximum (e.g., progress max +0.25)
|
| 104 |
+
- **Timeout handling**: Special penalty instead of score-based
|
| 105 |
+
- **Clamping**: Final reward bounded for numerical stability
|
| 106 |
+
|
| 107 |
+
## Debug Logging
|
| 108 |
+
|
| 109 |
+
When `verbose=True`, the environment prints detailed debug output via `_log_debug_step()`:
|
| 110 |
+
|
| 111 |
+
```python
|
| 112 |
+
env = PythonCodeReviewEnvironment(verbose=True)
|
| 113 |
+
obs = env.reset()
|
| 114 |
+
obs = env.step(action)
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
Output format:
|
| 118 |
+
```
|
| 119 |
+
Step 1 | Score: 0.698 | Delta: +0.698 | Reward: +0.4239 | Changed: False
|
| 120 |
+
| Progress=+0.174 | Quality=+0.149 | Stagnation=+0.100
|
| 121 |
+
| Reason: Syntax error detected: '(' was never closed
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
Shows:
|
| 125 |
+
- Step number
|
| 126 |
+
- Current score and delta from previous
|
| 127 |
+
- Final reward value
|
| 128 |
+
- Whether code changed
|
| 129 |
+
- Non-zero components only
|
| 130 |
+
- Human-readable reason
|
| 131 |
+
|
| 132 |
+
## Example: Full Episode with Rewards
|
| 133 |
+
|
| 134 |
+
```python
|
| 135 |
+
from server.env import PythonCodeReviewEnvironment
|
| 136 |
+
from models import PythonCodeReviewAction
|
| 137 |
+
|
| 138 |
+
env = PythonCodeReviewEnvironment(verbose=True)
|
| 139 |
+
obs = env.reset(task_id='syntax-fix-easy')
|
| 140 |
+
|
| 141 |
+
# Step 1: Analyze (no code change)
|
| 142 |
+
action = PythonCodeReviewAction(action_type='analyze_code')
|
| 143 |
+
obs = env.step(action)
|
| 144 |
+
print(f"Reward 1: {obs.reward_details.value:.4f}")
|
| 145 |
+
|
| 146 |
+
# Step 2: Edit with fix
|
| 147 |
+
code = 'x = 1; y = 2; print(x + y)'
|
| 148 |
+
action = PythonCodeReviewAction(action_type='edit_code', code=code)
|
| 149 |
+
obs = env.step(action)
|
| 150 |
+
print(f"Reward 2: {obs.reward_details.value:.4f}")
|
| 151 |
+
|
| 152 |
+
# Step 3: Submit
|
| 153 |
+
action = PythonCodeReviewAction(action_type='submit_solution')
|
| 154 |
+
obs = env.step(action)
|
| 155 |
+
print(f"Final Reward: {obs.reward_details.value:.4f}")
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
## Interpreting Rewards
|
| 159 |
+
|
| 160 |
+
### Positive Rewards (+0 to +1.0)
|
| 161 |
+
- **+0.5 to +1.0**: Major progress (syntax fix, many tests passing)
- **+0.2 to +0.5**: Good progress (score improvement, test gains)
- **+0.0 to +0.2**: Small progress (quality improvement, minor gains)
|
| 164 |
+
|
| 165 |
+
### Negative Rewards (−1.0 to −0)
|
| 166 |
+
- **−0.1 to 0**: Stagnation (analyzed without changing code)
- **−0.2 to −0.1**: Slight regression (small score drop)
- **−0.5 to −0.2**: Major regression (significant score drop)
- **−1.0 to −0.5**: Invalid action or timeout
|
| 170 |
+
|
| 171 |
+
## Tuning the Reward System
|
| 172 |
+
|
| 173 |
+
### For Faster Early Learning
|
| 174 |
+
↑ Increase `SYNTAX_FIX_BONUS` and `COMPLETION_BONUS`
|
| 175 |
+
|
| 176 |
+
### To Encourage Editing Over Analysis
|
| 177 |
+
↑ Increase `STAGNATION_PENALTY`
|
| 178 |
+
|
| 179 |
+
### To Reward Test Improvements More
|
| 180 |
+
↑ Increase `TEST_PASS_REWARD_SCALE`
|
| 181 |
+
|
| 182 |
+
### To Penalize Mistakes More
|
| 183 |
+
↑ Increase `REGRESSION_PENALTY_SCALE`
|
| 184 |
+
|
| 185 |
+
### To Balance All Components
|
| 186 |
+
Adjust the Scale constants (all in range 0.15-0.35 for stability)
|
| 187 |
+
|
| 188 |
+
## Accessing Documentation Programmatically
|
| 189 |
+
|
| 190 |
+
```python
|
| 191 |
+
from server.env import PythonCodeReviewEnvironment
|
| 192 |
+
from models import RewardDetails
|
| 193 |
+
import server.env
|
| 194 |
+
|
| 195 |
+
# Module-level architecture
|
| 196 |
+
print(server.env.__doc__)
|
| 197 |
+
|
| 198 |
+
# RewardDetails fields
|
| 199 |
+
print(RewardDetails.__doc__)
|
| 200 |
+
|
| 201 |
+
# One method
|
| 202 |
+
env = PythonCodeReviewEnvironment()
|
| 203 |
+
help(env._compute_reward_components)
|
| 204 |
+
```
|
| 205 |
+
|
| 206 |
+
All major functions and classes have comprehensive docstrings.
|
__init__.py
CHANGED
|
@@ -1,16 +1,40 @@
|
|
| 1 |
-
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
from .
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
__all__ = [
|
| 13 |
-
"PythonAction",
|
| 14 |
-
"PythonObservation",
|
| 15 |
"PythonEnv",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
]
|
|
|
|
| 1 |
+
"""Public package API for the Python code review OpenEnv benchmark."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
try:
|
| 4 |
+
from .client import CodeReviewEnv, MyEnv, PythonEnv
|
| 5 |
+
from .models import (
|
| 6 |
+
HealthResponse,
|
| 7 |
+
HistoryEntry,
|
| 8 |
+
PythonCodeReviewAction,
|
| 9 |
+
PythonCodeReviewObservation,
|
| 10 |
+
PythonCodeReviewState,
|
| 11 |
+
RewardDetails,
|
| 12 |
+
TaskDescriptor,
|
| 13 |
+
TaskGrade,
|
| 14 |
+
)
|
| 15 |
+
except ImportError: # pragma: no cover
|
| 16 |
+
from client import CodeReviewEnv, MyEnv, PythonEnv
|
| 17 |
+
from models import (
|
| 18 |
+
HealthResponse,
|
| 19 |
+
HistoryEntry,
|
| 20 |
+
PythonCodeReviewAction,
|
| 21 |
+
PythonCodeReviewObservation,
|
| 22 |
+
PythonCodeReviewState,
|
| 23 |
+
RewardDetails,
|
| 24 |
+
TaskDescriptor,
|
| 25 |
+
TaskGrade,
|
| 26 |
+
)
|
| 27 |
|
| 28 |
__all__ = [
|
|
|
|
|
|
|
| 29 |
"PythonEnv",
|
| 30 |
+
"CodeReviewEnv",
|
| 31 |
+
"MyEnv",
|
| 32 |
+
"PythonCodeReviewAction",
|
| 33 |
+
"PythonCodeReviewObservation",
|
| 34 |
+
"PythonCodeReviewState",
|
| 35 |
+
HealthResponse,
|
| 36 |
+
HistoryEntry,
|
| 37 |
+
RewardDetails,
|
| 38 |
+
TaskDescriptor,
|
| 39 |
+
TaskGrade,
|
| 40 |
]
|
client.py
CHANGED
|
@@ -1,46 +1,75 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
-
|
| 8 |
|
| 9 |
-
from
|
|
|
|
|
|
|
| 10 |
|
| 11 |
from openenv.core import EnvClient
|
| 12 |
from openenv.core.client_types import StepResult
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Client for the Python code review environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
|
|
|
|
|
|
| 4 |
|
| 5 |
+
from typing import Dict
|
| 6 |
|
| 7 |
+
from compat import install_openenv_fastmcp_compat
|
| 8 |
+
|
| 9 |
+
install_openenv_fastmcp_compat()
|
| 10 |
|
| 11 |
from openenv.core import EnvClient
|
| 12 |
from openenv.core.client_types import StepResult
|
| 13 |
+
|
| 14 |
+
from models import (
|
| 15 |
+
HistoryEntry,
|
| 16 |
+
PythonCodeReviewAction,
|
| 17 |
+
PythonCodeReviewObservation,
|
| 18 |
+
PythonCodeReviewState,
|
| 19 |
+
RewardDetails,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class PythonEnv(
    EnvClient[PythonCodeReviewAction, PythonCodeReviewObservation, PythonCodeReviewState]
):
    """OpenEnv HTTP client for the Python code review benchmark.

    Serializes typed actions into JSON step payloads and rebuilds typed
    observation/state objects from the server's JSON responses.
    """

    def _step_payload(self, action: PythonCodeReviewAction) -> Dict:
        # Drop unset optional fields so the server only receives provided values.
        return action.model_dump(exclude_none=True)

    def _parse_result(self, payload: Dict) -> StepResult[PythonCodeReviewObservation]:
        """Convert a raw /step response payload into a typed StepResult.

        The five task-identity keys (task_id, title, difficulty, task_kind,
        task_description) are required and raise KeyError when absent; all
        remaining fields are defaulted so partial payloads still parse.
        """
        obs = payload.get("observation", {})
        observation = PythonCodeReviewObservation(
            task_id=obs["task_id"],
            title=obs["title"],
            difficulty=obs["difficulty"],
            task_kind=obs["task_kind"],
            task_description=obs["task_description"],
            current_code=obs.get("current_code", ""),
            errors=obs.get("errors", ""),
            test_results=obs.get("test_results", ""),
            history=[HistoryEntry(**entry) for entry in obs.get("history", [])],
            attempts_remaining=obs.get("attempts_remaining", 0),
            last_action_status=obs.get("last_action_status", ""),
            score=obs.get("score", 0.0),
            # Missing reward_details falls back to the model's field defaults.
            reward_details=RewardDetails(**obs.get("reward_details", {})),
            # done/reward may appear at the top level or nested inside the
            # observation depending on server version; prefer the top level.
            done=payload.get("done", obs.get("done", False)),
            reward=payload.get("reward", obs.get("reward")),
            metadata=obs.get("metadata", {}),
        )
        return StepResult(
            observation=observation,
            reward=payload.get("reward", obs.get("reward")),
            done=payload.get("done", obs.get("done", False)),
        )

    def _parse_state(self, payload: Dict) -> PythonCodeReviewState:
        """Build a typed episode-state snapshot from the /state JSON payload."""
        return PythonCodeReviewState(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
            task_id=payload.get("task_id"),
            difficulty=payload.get("difficulty"),
            task_kind=payload.get("task_kind"),
            attempts_remaining=payload.get("attempts_remaining", 0),
            current_code=payload.get("current_code", ""),
            errors=payload.get("errors", ""),
            test_results=payload.get("test_results", ""),
            history=[HistoryEntry(**entry) for entry in payload.get("history", [])],
            score=payload.get("score", 0.0),
            done=payload.get("done", False),
        )


# Backwards/alternate-name aliases expected by OpenEnv tooling.
CodeReviewEnv = PythonEnv
MyEnv = PythonEnv
|
compat.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility helpers for OpenEnv and FastMCP runtime drift."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import sys
|
| 6 |
+
import types
|
| 7 |
+
from typing import Any, Optional
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def install_openenv_fastmcp_compat() -> None:
    """Patch FastMCP API differences so older OpenEnv builds keep importing.

    Best-effort shim: each patch is wrapped in a broad try/except so a
    partially incompatible FastMCP install never prevents importing this
    package. Idempotent — existing attributes/modules are left untouched.
    """
    # If FastMCP is not installed there is nothing to patch.
    try:
        import fastmcp  # type: ignore
    except Exception:
        return

    # Shim 1: legacy OpenEnv imports `fastmcp.Client`; provide a stub if absent.
    try:
        if not hasattr(fastmcp, "Client"):

            class CompatClient:
                """Minimal async MCP client used for legacy OpenEnv imports."""

                def __init__(self, *args: Any, **kwargs: Any) -> None:
                    # Arguments are recorded but unused; the stub only needs to
                    # be constructible and satisfy the async context protocol.
                    self.args = args
                    self.kwargs = kwargs

                async def __aenter__(self) -> "CompatClient":
                    return self

                async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool:
                    # Never suppress exceptions raised inside the context.
                    return False

                async def list_tools(self) -> list[Any]:
                    return []

                async def call_tool(self, tool_name: str, arguments: dict[str, Any]) -> Any:
                    # The stub cannot reach a real MCP server; fail loudly.
                    raise RuntimeError(
                        f"MCP client compatibility mode cannot call tool: {tool_name}"
                    )

            fastmcp.Client = CompatClient  # type: ignore[attr-defined]
    except Exception:
        pass

    # Shim 2: guarantee `fastmcp.client.client.CallToolResult` is importable by
    # synthesizing the intermediate modules in sys.modules when missing.
    try:
        client_pkg = sys.modules.get("fastmcp.client")
        if client_pkg is None:
            client_pkg = types.ModuleType("fastmcp.client")
            sys.modules["fastmcp.client"] = client_pkg

        client_mod = sys.modules.get("fastmcp.client.client")
        if client_mod is None:
            client_mod = types.ModuleType("fastmcp.client.client")
            sys.modules["fastmcp.client.client"] = client_mod

        if not hasattr(client_mod, "CallToolResult"):

            class CallToolResult:
                """Compatibility container for legacy OpenEnv response handling."""

                def __init__(
                    self,
                    content: Any = None,
                    structured_content: Any = None,
                    meta: Any = None,
                    data: Any = None,
                    is_error: bool = False,
                ) -> None:
                    self.content = content
                    self.structured_content = structured_content
                    self.meta = meta
                    self.data = data
                    self.is_error = is_error

            client_mod.CallToolResult = CallToolResult

        # Link the synthetic submodule so attribute access (fastmcp.client.client)
        # works, not just sys.modules-based imports.
        client_pkg.client = client_mod  # type: ignore[attr-defined]
    except Exception:
        pass
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# Apply the FastMCP shims at import time so the OpenEnv imports below succeed.
install_openenv_fastmcp_compat()


try:
    from openenv.core.env_server.http_server import create_app as openenv_create_app
    # NOTE(review): Environment/Action/Observation/State are not used in this
    # module — presumably re-exported for downstream imports; confirm callers.
    from openenv.core.env_server.interfaces import Environment
    from openenv.core.env_server.types import Action, Observation, State
except Exception as exc:  # pragma: no cover
    raise RuntimeError(f"OpenEnv runtime import failed after compatibility patch: {exc}") from exc


# Public alias so servers can simply `from compat import create_app`.
create_app = openenv_create_app
|
| 92 |
+
|
examples/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Example snippets for the Python review environment."""
|
examples/python_review_examples.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Example Python snippets for exercising the review environment."""
|
| 2 |
+
|
| 3 |
+
# Named code samples keyed by the review scenario each one exercises.
# Values are snippet source text (lines joined with "\n").
EXAMPLE_SNIPPETS = {
    # eval() on untrusted config text: arbitrary-code-execution hazard.
    "unsafe_eval": "\n".join(
        [
            "def load_settings(config_text):",
            "    return eval(config_text)",
        ]
    ),
    # Classic mutable-default-argument bug: the list is shared across calls.
    "mutable_default": "\n".join(
        [
            "def append_name(name, names=[]):",
            "    names.append(name)",
            "    return names",
        ]
    ),
    # Bare `except:` silently swallows every exception, including SystemExit.
    "bare_except": "\n".join(
        [
            "def publish_report(report):",
            "    try:",
            '        return report["summary"]',
            "    except:",
            "        return None",
        ]
    ),
    # shell=True with interpolated user input: command-injection hazard.
    "shell_injection": "\n".join(
        [
            "import subprocess",
            "",
            "def run_script(script_path, user_input):",
            '    cmd = f"python {script_path} {user_input}"',
            "    return subprocess.check_output(cmd, shell=True, text=True)",
        ]
    ),
    # Unterminated parameter list: must trigger the syntax-error rule.
    "syntax_error": "\n".join(
        [
            "def broken_function(",
            "    return 42",
        ]
    ),
    # Well-formed snippet expected to raise no review findings.
    # NOTE(review): the rendered diff collapsed whitespace inside string
    # literals; the replace() args are presumed "  " -> " " (squeeze double
    # spaces) — confirm against the repository source.
    "clean_function": "\n".join(
        [
            "def normalize_name(name: str) -> str:",
            "    cleaned = name.strip().lower()",
            '    return cleaned.replace("  ", " ")',
        ]
    ),
}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# Rule ids the static reviewer is expected to raise for each sample snippet;
# a clean snippet maps to the empty set.
EXPECTED_RULE_IDS = dict(
    unsafe_eval={"avoid-eval"},
    mutable_default={"mutable-default-list"},
    bare_except={"bare-except"},
    shell_injection={"shell-true-command-injection"},
    syntax_error={"syntax-error"},
    clean_function=set(),
)
|
graders/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic graders for the Python code review environment."""
|
| 2 |
+
|
| 3 |
+
from .common import clamp_score
|
| 4 |
+
from .optimization import grade_optimization_task
|
| 5 |
+
from .pytest_runner import PytestExecution, run_pytest_suite
|
| 6 |
+
from .syntax import grade_bug_fix_task, grade_syntax_task, grade_task
|
| 7 |
+
|
| 8 |
+
__all__ = [
|
| 9 |
+
"PytestExecution",
|
| 10 |
+
"clamp_score",
|
| 11 |
+
"grade_bug_fix_task",
|
| 12 |
+
"grade_optimization_task",
|
| 13 |
+
"grade_syntax_task",
|
| 14 |
+
"grade_task",
|
| 15 |
+
"run_pytest_suite",
|
| 16 |
+
]
|
graders/common.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared deterministic scoring helpers."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import ast
|
| 6 |
+
import difflib
|
| 7 |
+
import traceback
|
| 8 |
+
from typing import Tuple
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def clamp_score(value: float) -> float:
    """Bound a raw score to the canonical [0.0, 1.0] interval.

    The value is rounded to six decimal places before clamping so that
    scores serialize compactly and compare deterministically.
    """

    rounded = round(value, 6)
    return max(0.0, min(1.0, rounded))
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def syntax_error_message(code: str) -> str:
    """Describe the first syntax problem in ``code``, or return '' if it parses."""

    try:
        ast.parse(code)
    except SyntaxError as err:
        # Include location so graders can surface it directly to the agent.
        return f"{err.msg} (line {err.lineno}, column {err.offset})"
    except Exception:  # pragma: no cover - defensive catch-all
        return traceback.format_exc(limit=1).strip()
    else:
        return ""
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def compiles(code: str) -> bool:
    """Report whether ``code`` byte-compiles as an executable module."""

    try:
        compile(code, "<candidate>", "exec")
        return True
    except Exception:
        return False
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def normalized_diff_score(code: str, reference_code: str) -> float:
    """Score textual similarity to the reference solution, ignoring whitespace."""

    # Strip all whitespace so formatting differences never affect the score.
    squeezed_candidate = "".join(code.split())
    squeezed_reference = "".join(reference_code.split())
    matcher = difflib.SequenceMatcher(a=squeezed_candidate, b=squeezed_reference)
    return clamp_score(matcher.ratio())
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def style_score(code: str, max_line_length: int = 88) -> float:
    """Simple deterministic PEP8-inspired style score.

    Weighted mix: 60% lines within the length limit, 20% no tabs anywhere,
    20% no trailing whitespace on any line.
    """

    lines = code.splitlines() or [""]  # empty input counts as one empty line
    within_limit = sum(1 for line in lines if len(line) <= max_line_length)
    line_length_ok = within_limit / len(lines)
    tab_ok = 0.0 if any("\t" in line for line in lines) else 1.0
    trailing_ws_ok = 0.0 if any(line != line.rstrip() for line in lines) else 1.0
    return clamp_score((line_length_ok * 0.6) + (tab_ok * 0.2) + (trailing_ws_ok * 0.2))
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def nested_loop_depth(tree: ast.AST) -> int:
    """Return the maximum loop-nesting depth found anywhere in the AST."""

    loop_nodes = (ast.For, ast.AsyncFor, ast.While)

    def measure(node: ast.AST, depth: int) -> int:
        # Entering any loop construct increases the current nesting level.
        if isinstance(node, loop_nodes):
            depth += 1
        deepest = depth
        for child in ast.iter_child_nodes(node):
            deepest = max(deepest, measure(child, depth))
        return deepest

    return measure(tree, 0)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def compile_tree(code: str) -> Tuple[ast.AST | None, str]:
|
| 77 |
+
"""Return AST tree and optional parse error."""
|
| 78 |
+
|
| 79 |
+
try:
|
| 80 |
+
return ast.parse(code), ""
|
| 81 |
+
except SyntaxError as exc:
|
| 82 |
+
return None, f"{exc.msg} (line {exc.lineno}, column {exc.offset})"
|
graders/optimization.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic grading for optimization and refactor tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
import tempfile
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
from graders.common import clamp_score, compile_tree, nested_loop_depth, style_score
|
| 12 |
+
from graders.pytest_runner import run_pytest_suite
|
| 13 |
+
from models import TaskGrade
|
| 14 |
+
from tasks.task_bank import TaskSpec
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _benchmark_script(task: TaskSpec) -> str:
    """Render the Python source of a runner that benchmarks the candidate.

    The generated script imports the task's entrypoint from ``candidate.py``
    (benchmark_runtime rewrites this to ``starter.py`` for the baseline run),
    times ``benchmark_repeats`` calls over the task-built event fixture, and
    writes {"elapsed": seconds, "rows": result length} to ``benchmark.json``
    in the working directory. Double braces escape literal {} in the f-string.
    """
    return f"""import json
import time
from candidate import {task.benchmark_entrypoint}

{task.benchmark_builder}

events = build_benchmark_events()
start = time.perf_counter()
for _ in range({task.benchmark_repeats}):
    result = {task.benchmark_entrypoint}(events)
elapsed = time.perf_counter() - start
Path = __import__("pathlib").Path
Path("benchmark.json").write_text(json.dumps({{"elapsed": elapsed, "rows": len(result)}}), encoding="utf-8")
"""
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def benchmark_runtime(candidate_code: str, task: TaskSpec) -> tuple[float, bool, str]:
    """Benchmark runtime deterministically against the starter implementation.

    Runs the generated benchmark script once for the starter code and once for
    the candidate in a throwaway temp directory, then converts the speedup
    into a 0..1 score: <=1x -> 0.0, >=4x -> 1.0, linear in between.

    Returns:
        (runtime_score, timed_out, combined_output_text)
    """

    assert task.benchmark_entrypoint is not None
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-bench-") as temp_dir:
            temp_path = Path(temp_dir)
            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (temp_path / "starter.py").write_text(task.starter_code, encoding="utf-8")
            (temp_path / "candidate_runner.py").write_text(_benchmark_script(task), encoding="utf-8")

            # The starter runner is the same script, retargeted at starter.py.
            starter_script = _benchmark_script(task).replace("from candidate import", "from starter import")
            (temp_path / "starter_runner.py").write_text(starter_script, encoding="utf-8")

            try:
                starter_run = subprocess.run(
                    [sys.executable, "starter_runner.py"],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=task.benchmark_timeout_s,
                    check=False,
                )
                # Both runners write the same benchmark.json; the starter's
                # payload must be read before the candidate overwrites it.
                starter_payload = json.loads((temp_path / "benchmark.json").read_text(encoding="utf-8"))

                # NOTE(review): check=False means a crashed run still reaches
                # the json.loads below — a candidate that exits before writing
                # benchmark.json would re-read the starter's file; confirm this
                # fallthrough is intended or gate on returncode.
                candidate_run = subprocess.run(
                    [sys.executable, "candidate_runner.py"],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=task.benchmark_timeout_s,
                    check=False,
                )
                candidate_payload = json.loads((temp_path / "benchmark.json").read_text(encoding="utf-8"))
            except subprocess.TimeoutExpired as exc:
                # Timeouts are reported distinctly so the grader can zero the score.
                output = (exc.stdout or "") + (exc.stderr or "")
                return 0.0, True, (output or "benchmark timed out").strip()
            except Exception as exc:  # pragma: no cover
                return 0.0, False, str(exc)

            # Floor at 1e-9 s to avoid division by zero on degenerate timings.
            starter_elapsed = max(float(starter_payload["elapsed"]), 1e-9)
            candidate_elapsed = max(float(candidate_payload["elapsed"]), 1e-9)
            speedup = starter_elapsed / candidate_elapsed
            # Linear map: 1x speedup -> 0.0, 4x or better -> 1.0.
            runtime_score = clamp_score(min((speedup - 1.0) / 3.0, 1.0))
            # Concatenate only the non-empty output fragments plus a summary line.
            output = "\n".join(
                part
                for part in [
                    starter_run.stdout.strip(),
                    starter_run.stderr.strip(),
                    candidate_run.stdout.strip(),
                    candidate_run.stderr.strip(),
                    f"starter={starter_elapsed:.6f}s candidate={candidate_elapsed:.6f}s speedup={speedup:.2f}x",
                ]
                if part
            )
            return runtime_score, False, output
    except Exception as exc:  # pragma: no cover
        return 0.0, False, str(exc)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def ast_quality_score(code: str, task: TaskSpec) -> float:
|
| 95 |
+
"""Score maintainability and algorithmic structure."""
|
| 96 |
+
|
| 97 |
+
tree, parse_error = compile_tree(code)
|
| 98 |
+
if tree is None:
|
| 99 |
+
return 0.0
|
| 100 |
+
|
| 101 |
+
import ast
|
| 102 |
+
|
| 103 |
+
function_node = next(
|
| 104 |
+
(node for node in tree.body if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))),
|
| 105 |
+
None,
|
| 106 |
+
)
|
| 107 |
+
docstring_points = 0.2 if function_node and ast.get_docstring(function_node, clean=False) else 0.0
|
| 108 |
+
nested_points = 0.4 if nested_loop_depth(tree) <= 1 else 0.0
|
| 109 |
+
marker_points = 0.0
|
| 110 |
+
for marker in task.expected_quality_markers:
|
| 111 |
+
if marker in code:
|
| 112 |
+
marker_points += 0.2
|
| 113 |
+
return clamp_score(docstring_points + nested_points + marker_points)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def grade_optimization_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Grade optimization tasks using correctness, runtime, AST quality, and style.

    Weighting: 50% test pass fraction, 30% benchmark speedup score,
    15% AST/maintainability quality, 5% PEP 8 style.  Any timeout
    (tests or benchmark) zeroes the overall score.
    """

    # Run visible and hidden tests together, bounded by the benchmark timeout.
    execution = run_pytest_suite(
        candidate_code,
        [*task.visible_tests, *task.hidden_tests],
        timeout_s=task.benchmark_timeout_s,
    )
    test_fraction = execution.passed / execution.total if execution.total else 0.0

    if execution.timed_out:
        # A hung test suite is an automatic zero; keep counts for feedback.
        return TaskGrade(
            score=0.0,
            tests_passed=execution.passed,
            tests_total=execution.total,
            timed_out=True,
            details={"tests": execution.output},
        )

    runtime_score, timed_out, benchmark_output = benchmark_runtime(candidate_code, task)
    if timed_out:
        # Benchmark hang also zeroes the score, with both logs attached.
        return TaskGrade(
            score=0.0,
            tests_passed=execution.passed,
            tests_total=execution.total,
            timed_out=True,
            details={"tests": execution.output, "benchmark": benchmark_output},
        )

    quality_score = ast_quality_score(candidate_code, task)
    pep8_score = style_score(candidate_code, task.style_max_line_length)
    # Weighted blend of the four signals, clamped into [0, 1].
    score = clamp_score(
        (0.5 * test_fraction)
        + (0.3 * runtime_score)
        + (0.15 * quality_score)
        + (0.05 * pep8_score)
    )
    return TaskGrade(
        score=score,
        syntax_score=1.0,  # reaching this point implies the code compiled
        tests_passed=execution.passed,
        tests_total=execution.total,
        quality_score=quality_score,
        runtime_score=runtime_score,
        details={
            "tests": execution.output,
            "benchmark": benchmark_output,
            "test_fraction": round(test_fraction, 4),
            "runtime_score": round(runtime_score, 4),
            "style_score": round(pep8_score, 4),
        },
    )
|
graders/pytest_runner.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Helpers for deterministic pytest execution in temp sandboxes."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
import tempfile
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Iterable
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass(frozen=True)
class PytestExecution:
    """Exact pytest execution summary."""

    # Number of test functions whose call phase passed.
    passed: int
    # Number of test functions whose call phase failed.
    failed: int
    # Total tests accounted for; never less than the number of supplied cases.
    total: int
    # True when the pytest subprocess exceeded its wall-clock timeout.
    timed_out: bool
    # Combined stdout/stderr captured from the subprocess, stripped.
    output: str
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _test_module_source(tests: Iterable[str]) -> str:
|
| 26 |
+
"""Build a valid pytest module from expression-style or full test snippets."""
|
| 27 |
+
blocks: list[str] = ["from candidate import * # noqa: F401,F403"]
|
| 28 |
+
for index, test in enumerate(tests, start=1):
|
| 29 |
+
snippet = str(test).strip()
|
| 30 |
+
if not snippet:
|
| 31 |
+
continue
|
| 32 |
+
if snippet.startswith("def test_"):
|
| 33 |
+
blocks.append(snippet)
|
| 34 |
+
continue
|
| 35 |
+
blocks.append(
|
| 36 |
+
"\n".join(
|
| 37 |
+
[
|
| 38 |
+
f"def test_case_{index:03d}():",
|
| 39 |
+
f" assert {snippet}",
|
| 40 |
+
]
|
| 41 |
+
)
|
| 42 |
+
)
|
| 43 |
+
return "\n\n".join(blocks) or "def test_placeholder():\n assert True\n"
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _runner_script() -> str:
    """Return the source of a small driver that runs pytest in-process.

    The driver counts passed/failed tests via the ``pytest_runtest_logreport``
    plugin hook and writes the summary to ``pytest_results.json`` so the parent
    process reads exact counts instead of parsing pytest console output.
    """
    return """import json
import pathlib
import pytest


class Collector:
    def __init__(self) -> None:
        self.passed = 0
        self.failed = 0

    def pytest_runtest_logreport(self, report):
        if report.when != "call":
            return
        if report.passed:
            self.passed += 1
        elif report.failed:
            self.failed += 1


collector = Collector()
exit_code = pytest.main(["-q", "test_candidate.py"], plugins=[collector])
payload = {
    "passed": collector.passed,
    "failed": collector.failed,
    "exit_code": int(exit_code),
}
pathlib.Path("pytest_results.json").write_text(json.dumps(payload), encoding="utf-8")
"""
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def run_pytest_suite(candidate_code: str, tests: Iterable[str], timeout_s: float = 3.0) -> PytestExecution:
    """Run a pytest suite against candidate.py and return structured results.

    The candidate code, a generated test module, and a JSON-reporting runner
    are written into a throwaway directory; the runner executes in a
    subprocess so a hang is bounded by ``timeout_s``.  Every failure mode
    degrades to a zero-passed PytestExecution instead of raising.
    """

    test_cases = list(tests)
    try:
        with tempfile.TemporaryDirectory(prefix="python-code-review-") as temp_dir:
            temp_path = Path(temp_dir)
            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
            (temp_path / "test_candidate.py").write_text(_test_module_source(test_cases), encoding="utf-8")
            (temp_path / "runner.py").write_text(_runner_script(), encoding="utf-8")

            try:
                completed = subprocess.run(
                    [sys.executable, "runner.py"],
                    cwd=temp_path,
                    capture_output=True,
                    text=True,
                    timeout=timeout_s,
                    check=False,
                )
            except subprocess.TimeoutExpired as exc:
                # Timeout: report every supplied case as failed.
                output = (exc.stdout or "") + (exc.stderr or "")
                return PytestExecution(
                    passed=0,
                    failed=max(len(test_cases), 1),
                    total=max(len(test_cases), 1),
                    timed_out=True,
                    output=(output or "pytest timed out").strip(),
                )

            result_path = temp_path / "pytest_results.json"
            if not result_path.exists():
                # Runner crashed before writing results (e.g. import error).
                output = (completed.stdout or "") + (completed.stderr or "")
                total = max(len(test_cases), 1)
                return PytestExecution(
                    passed=0,
                    failed=total,
                    total=total,
                    timed_out=False,
                    output=output.strip(),
                )

            try:
                payload = json.loads(result_path.read_text(encoding="utf-8"))
            except Exception as exc:
                # Corrupt results file: fall back to console output for context.
                output = ((completed.stdout or "") + (completed.stderr or "")).strip()
                return PytestExecution(
                    passed=0,
                    failed=max(len(test_cases), 1),
                    total=max(len(test_cases), 1),
                    timed_out=False,
                    output=(output or str(exc)).strip(),
                )

            passed = int(payload.get("passed", 0))
            failed = int(payload.get("failed", 0))
            # total never under-counts the supplied cases (collection errors
            # may mean passed+failed < len(test_cases)).
            total = max(passed + failed, len(test_cases))
            output = ((completed.stdout or "") + (completed.stderr or "")).strip()
            return PytestExecution(
                passed=passed,
                failed=failed,
                total=total,
                timed_out=False,
                output=output,
            )
    except Exception as exc:
        # Any sandbox failure (tempdir, filesystem) degrades to all-failed.
        return PytestExecution(
            passed=0,
            failed=max(len(test_cases), 1),
            total=max(len(test_cases), 1),
            timed_out=False,
            output=str(exc),
        )
|
graders/syntax.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task graders for syntax and bug-fix tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from graders.common import clamp_score, compiles, normalized_diff_score, style_score, syntax_error_message
|
| 6 |
+
from graders.optimization import grade_optimization_task
|
| 7 |
+
from graders.pytest_runner import run_pytest_suite
|
| 8 |
+
from models import TaskGrade
|
| 9 |
+
from tasks.task_bank import TaskSpec
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def grade_syntax_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
    """Grade syntax repair tasks with partial credit for progress toward the reference."""

    compile_error = syntax_error_message(candidate_code)
    closeness = normalized_diff_score(candidate_code, task.reference_code)
    style_base = style_score(candidate_code, task.style_max_line_length)

    if compile_error:
        # Still broken: a base 0.15 plus a share proportional to closeness.
        return TaskGrade(
            score=clamp_score(0.15 + (0.55 * closeness)),
            syntax_score=0.0,
            quality_score=closeness * style_base,
            details={"compile_error": compile_error},
        )

    # Compiles cleanly: full score, with style feeding the quality signal.
    return TaskGrade(
        score=1.0,
        syntax_score=1.0,
        quality_score=style_base,
        details={"compile_error": ""},
    )
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def grade_bug_fix_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
    """Grade logic bug tasks with pytest pass fraction."""

    if not compiles(candidate_code):
        # Non-compiling submissions score zero with the compiler message attached.
        return TaskGrade(
            score=0.0,
            syntax_score=0.0,
            details={"compile_error": syntax_error_message(candidate_code)},
        )

    suite = [*task.visible_tests, *(task.hidden_tests if include_hidden else [])]
    execution = run_pytest_suite(candidate_code, suite, timeout_s=3.0)
    if execution.timed_out:
        return TaskGrade(
            score=0.0,
            syntax_score=1.0,
            tests_passed=execution.passed,
            tests_total=execution.total,
            timed_out=True,
            details={"compile_error": "", "tests": execution.output},
        )

    pass_fraction = execution.passed / execution.total if execution.total else 0.0
    return TaskGrade(
        score=clamp_score(pass_fraction),
        syntax_score=1.0,
        tests_passed=execution.passed,
        tests_total=execution.total,
        quality_score=style_score(candidate_code, task.style_max_line_length),
        details={"compile_error": "", "tests": execution.output},
    )
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def grade_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
    """Dispatch to the correct deterministic grader for one task."""

    dispatch = {
        "syntax_fix": lambda: grade_syntax_task(candidate_code, task),
        "bug_fix": lambda: grade_bug_fix_task(candidate_code, task, include_hidden=include_hidden),
    }
    # Any unrecognized kind falls through to the optimization grader.
    handler = dispatch.get(task.task_kind, lambda: grade_optimization_task(candidate_code, task))
    return handler()
|
inference.py
CHANGED
|
@@ -1,314 +1,462 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
import
|
| 14 |
-
import
|
| 15 |
-
|
| 16 |
-
from
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
from openai import OpenAI
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
def
|
| 84 |
-
"""
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
return
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
"
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
)
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
if
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
"
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Fail-safe inference entrypoint for the Python code review environment."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import io
|
| 7 |
+
import json
|
| 8 |
+
import os
|
| 9 |
+
import subprocess
|
| 10 |
+
import sys
|
| 11 |
+
import time
|
| 12 |
+
from collections.abc import Iterable
|
| 13 |
+
from contextlib import redirect_stderr, redirect_stdout
|
| 14 |
+
from typing import Any, Dict, Optional
|
| 15 |
+
|
| 16 |
+
from compat import install_openenv_fastmcp_compat
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
from openai import OpenAI
|
| 20 |
+
except Exception:
|
| 21 |
+
OpenAI = None # type: ignore[assignment]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
install_openenv_fastmcp_compat()
|
| 25 |
+
|
| 26 |
+
try:
|
| 27 |
+
from server.env import PythonCodeReviewEnvironment
|
| 28 |
+
except Exception:
|
| 29 |
+
PythonCodeReviewEnvironment = None # type: ignore[assignment]
|
| 30 |
+
|
| 31 |
+
try:
|
| 32 |
+
from models import PythonCodeReviewAction
|
| 33 |
+
except Exception:
|
| 34 |
+
PythonCodeReviewAction = None # type: ignore[assignment]
|
| 35 |
+
|
| 36 |
+
try:
|
| 37 |
+
from tasks import task_ids
|
| 38 |
+
except Exception:
|
| 39 |
+
task_ids = None # type: ignore[assignment]
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
ALLOWED_ACTIONS = {
|
| 43 |
+
"analyze_code",
|
| 44 |
+
"edit_code",
|
| 45 |
+
"run_tests",
|
| 46 |
+
"submit_solution",
|
| 47 |
+
}
|
| 48 |
+
DEFAULT_MODEL_NAME = "mock-model"
|
| 49 |
+
DEFAULT_ACTION = {"action_type": "analyze_code", "code": None, "fallback_reason": "mock_response"}
|
| 50 |
+
API_TIMEOUT_SECONDS = 3.0
|
| 51 |
+
API_RETRIES = 1
|
| 52 |
+
API_RETRY_DELAY_SECONDS = 0.2
|
| 53 |
+
MAX_STEPS = 2
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def safe_env(name: str, default: str = "") -> str:
    """Read an allowed environment variable and return a safe string default."""
    try:
        raw = os.getenv(name)
        return default if raw is None else str(raw)
    except Exception:
        return default
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
    """Clamp a numeric value to a bounded range; non-numeric input maps to low."""
    try:
        bounded_above = min(high, float(value))
    except Exception:
        return low
    return max(low, bounded_above)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def safe_float(value: Any, default: float = 0.0) -> float:
    """Convert a value to float without raising."""
    try:
        result = float(value)
    except Exception:
        result = default
    return result
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def safe_text(value: Any, default: str = "") -> str:
    """Convert any value into a bounded, printable string.

    Collapses all whitespace runs to single spaces and truncates to 160
    characters; falls back to ``default`` for empty or unconvertible values.
    """
    try:
        collapsed = " ".join(str(value).split())
    except Exception:
        return default
    if not collapsed:
        return default
    return collapsed[:160]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def safe_getattr(obj: Any, name: str, default: Any = None) -> Any:
    """Fetch an attribute from an object without raising."""
    try:
        result = getattr(obj, name, default)
    except Exception:
        result = default
    return result
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def parse_json_response(raw_text: str) -> Dict[str, Any]:
    """Parse model output into a safe action payload with deterministic fallback.

    Extracts the first-to-last brace span, validates the action type against
    ALLOWED_ACTIONS, and strips code for non-edit actions.  Any parse or type
    failure yields a copy of DEFAULT_ACTION.
    """
    try:
        text = raw_text or ""
        start = text.find("{")
        end = text.rfind("}") + 1
        if start < 0 or end <= start:
            return dict(DEFAULT_ACTION)
        payload = json.loads(text[start:end])
        if not isinstance(payload, dict):
            return dict(DEFAULT_ACTION)
        action_type = payload.get("action_type", DEFAULT_ACTION["action_type"])
        if action_type not in ALLOWED_ACTIONS:
            action_type = DEFAULT_ACTION["action_type"]
        # Only edit actions may carry code.
        code = payload.get("code") if action_type == "edit_code" else None
        return {
            "action_type": action_type,
            "code": code,
            "fallback_reason": "",
        }
    except Exception:
        return dict(DEFAULT_ACTION)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def build_prompt(observation: Any) -> str:
    """Build a short prompt from the current observation with safe defaults.

    Every field is read defensively (safe_getattr/safe_text) so a malformed
    observation still yields a usable instruction block; at most four visible
    tests are listed to keep the prompt bounded.
    """
    try:
        task_description = safe_text(safe_getattr(observation, "task_description", ""), "No task description.")
        current_code = safe_text(safe_getattr(observation, "current_code", ""), "")
        errors = safe_text(safe_getattr(observation, "errors", ""), "")
        tests = safe_text(safe_getattr(observation, "test_results", ""), "")
        score = clamp(safe_getattr(observation, "score", 0.0))
        visible_tests = safe_getattr(observation, "visible_tests", [])
        # Strings/bytes are iterable but would explode into characters; drop them.
        if not isinstance(visible_tests, Iterable) or isinstance(visible_tests, (str, bytes)):
            visible_tests = []
        visible_lines = []
        for item in list(visible_tests)[:4]:
            visible_lines.append(f"- {safe_text(item, 'unknown test')}")
        visible_block = "\n".join(visible_lines) if visible_lines else "- none"
        return (
            "Return exactly one JSON object with keys action_type and optional code.\n"
            "Allowed action_type values: analyze_code, edit_code, run_tests, submit_solution.\n"
            f"Task: {task_description}\n"
            f"Score: {score:.3f}\n"
            f"Errors: {errors or 'none'}\n"
            f"Tests: {tests or 'not available'}\n"
            f"Visible tests:\n{visible_block}\n"
            f"Code:\n{current_code}\n"
        )
    except Exception:
        # Last-resort static prompt when observation access itself fails.
        return (
            "Return exactly one JSON object with keys action_type and optional code. "
            "Use action_type analyze_code."
        )
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def create_client() -> Optional[Any]:
    """Create an OpenAI-compatible client using only the allowed environment variables.

    Returns None when the OpenAI SDK is unavailable or API_BASE_URL is unset,
    so callers can fall back to the deterministic mock action path.
    """
    if OpenAI is None:
        return None
    base_url = safe_env("API_BASE_URL", "")
    if not base_url:
        return None
    try:
        # The OpenAI client reads OPENAI_API_KEY; mirror HF_TOKEN into it when present.
        if safe_env("HF_TOKEN", ""):
            os.environ["OPENAI_API_KEY"] = safe_env("HF_TOKEN", "")
    except Exception:
        pass
    try:
        # Fix: construct from the validated base_url instead of a second raw
        # os.getenv("API_BASE_URL") read, so the guard above and the client
        # construction can never disagree.
        return OpenAI(base_url=base_url)
    except Exception:
        return None
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def run_llm(client: Optional[Any], model: str, prompt: str) -> Dict[str, Any]:
    """Call the LLM with timeout and retry, then fall back to a mock action.

    Returns a validated action payload.  ``fallback_reason`` is non-empty
    whenever the model path was not used or its output could not be parsed.
    """
    if client is None:
        fallback = dict(DEFAULT_ACTION)
        fallback["fallback_reason"] = "client_unavailable"
        return fallback

    last_reason = "llm_unavailable"
    for attempt in range(API_RETRIES + 1):
        try:
            # Silence SDK chatter; each call is bounded by API_TIMEOUT_SECONDS.
            with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
                response = client.with_options(timeout=API_TIMEOUT_SECONDS).chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                    max_tokens=300,
                )
            message = safe_getattr(response.choices[0].message, "content", "")
            parsed = parse_json_response(message)
            if parsed.get("fallback_reason"):
                # The call succeeded but the payload did not parse cleanly.
                parsed["fallback_reason"] = "parse_failed"
            return parsed
        except Exception as exc:
            last_reason = safe_text(exc, "llm_error").lower().replace(" ", "_")
            if attempt < API_RETRIES:
                try:
                    # Linear backoff between retries.
                    time.sleep(API_RETRY_DELAY_SECONDS * (attempt + 1))
                except Exception:
                    pass

    fallback = dict(DEFAULT_ACTION)
    fallback["fallback_reason"] = last_reason[:48] or "llm_retry_exhausted"
    return fallback
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def probe_docker(image_name: str) -> Dict[str, Any]:
    """Safely validate Docker connectivity when a local image name is provided.

    An empty name skips the probe entirely; otherwise ``docker image inspect``
    is run with a short timeout and the outcome is reported as a small dict.
    """
    if not image_name:
        return {"checked": False, "available": False, "reason": "docker_skip"}
    try:
        with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
            inspect = subprocess.run(
                ["docker", "image", "inspect", image_name],
                capture_output=True,
                text=True,
                timeout=3,
                check=False,
            )
        available = inspect.returncode == 0
        reason = "docker_ok" if available else "docker_unreachable"
        return {"checked": True, "available": available, "reason": reason}
    except Exception as exc:
        failure = safe_text(exc, "docker_error").lower().replace(" ", "_")
        return {"checked": True, "available": False, "reason": failure}
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def fallback_step_result(reason: str, docker_status: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Return a deterministic dummy step result when environment execution fails."""
    status = docker_status or {}
    docker_reason = safe_text(status.get("reason", "docker_skip"), "docker_skip")
    short_reason = safe_text(reason, "env_fallback").lower().replace(" ", "_")
    # All metrics are zeroed; the episode is reported as finished.
    result: Dict[str, Any] = {
        "status": "ok",
        "fallback": True,
        "reason": short_reason[:64],
        "reward": 0.0,
        "improvement": 0.0,
        "score": 0.0,
        "done": True,
        "docker": docker_reason[:32],
    }
    return result
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def safe_task_list() -> list[str]:
    """Load task identifiers without raising; always returns at least one id."""
    fallback = ["fallback-task"]
    try:
        if not callable(task_ids):
            return fallback
        loaded = [safe_text(item, "fallback-task") for item in task_ids()]
        return loaded if loaded else fallback
    except Exception:
        return fallback
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def make_action(action_payload: Dict[str, Any]) -> Any:
    """Build a validated environment action or a safe placeholder.

    Unknown action types collapse to the default; code is only forwarded for
    edit actions.  Falls back to a plain dict when the action class is missing
    or rejects construction.
    """
    action_type = action_payload.get("action_type", DEFAULT_ACTION["action_type"])
    if action_type not in ALLOWED_ACTIONS:
        action_type = DEFAULT_ACTION["action_type"]
    code = action_payload.get("code") if action_type == "edit_code" else None

    if PythonCodeReviewAction is None:
        return {"action_type": action_type, "code": code}
    try:
        return PythonCodeReviewAction(action_type=action_type, code=code)
    except Exception:
        pass
    try:
        return PythonCodeReviewAction(action_type=DEFAULT_ACTION["action_type"], code=None)
    except Exception:
        return {"action_type": DEFAULT_ACTION["action_type"], "code": None}
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
def compute_reward(
    previous_score: float,
    current_score: float,
    step_reward: float,
    used_fallback: bool,
    done: bool,
) -> Dict[str, float]:
    """Compute a deterministic dynamic reward and improvement metric.

    Blend: 55% current score, 30% positive improvement, 10% positive bounded
    step reward, +0.05 completion bonus for a near-perfect finish, -0.05
    penalty when any fallback path was taken; clamped to [0, 1].
    """
    prev_value = clamp(previous_score)
    curr_value = clamp(current_score)
    improvement = round(curr_value - prev_value, 4)
    bounded_step = max(-1.0, min(1.0, safe_float(step_reward, 0.0)))

    reward_value = 0.55 * curr_value
    reward_value += 0.30 * max(improvement, 0.0)
    reward_value += 0.10 * max(bounded_step, 0.0)
    if done and curr_value >= 0.99:
        reward_value += 0.05
    if used_fallback:
        reward_value -= 0.05

    return {
        "reward": round(clamp(reward_value), 4),
        "improvement": improvement,
    }
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
def safe_step(env: Any, action: Any) -> Any:
    """Execute one environment step without allowing stdout leaks or exceptions."""
    try:
        with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
            result = env.step(action)
        return result
    except Exception:
        return None
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
def safe_reset(env: Any, task_id: str) -> Any:
    """Reset the environment safely for a task; None signals failure."""
    try:
        with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
            observation = env.reset(task_id=task_id)
        return observation
    except Exception:
        return None
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def run_env(client: Optional[Any], model: str) -> Dict[str, Any]:
    """Run the environment loop safely and return a structured result payload.

    Flow: probe Docker, construct the environment, reset the first task, then
    take up to MAX_STEPS LLM-driven steps (auto-submitting after the first
    step when the episode is not done).  Every failure path returns a
    deterministic fallback payload instead of raising.
    """
    docker_status = probe_docker(safe_env("LOCAL_IMAGE_NAME", ""))
    if PythonCodeReviewEnvironment is None:
        return fallback_step_result("env_import_failed", docker_status)

    try:
        # Suppress any construction-time logging from the environment.
        with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
            env = PythonCodeReviewEnvironment(verbose=False)
    except Exception as exc:
        return fallback_step_result(f"env_init_failed_{safe_text(exc, 'unknown')}", docker_status)

    tasks = safe_task_list()
    task_id = tasks[0] if tasks else "fallback-task"
    observation = safe_reset(env, task_id)
    if observation is None:
        return fallback_step_result("env_reset_failed", docker_status)

    previous_score = clamp(safe_getattr(observation, "score", 0.0))
    total_step_reward = 0.0
    used_fallback = False
    final_status = "ok"
    final_reason = "completed"
    final_observation = observation

    for step_index in range(MAX_STEPS):
        prompt = build_prompt(final_observation)
        action_payload = run_llm(client, model, prompt)
        used_fallback = used_fallback or bool(action_payload.get("fallback_reason"))
        action = make_action(action_payload)
        next_observation = safe_step(env, action)
        if next_observation is None:
            # Step crashed: keep the last good observation and stop.
            final_status = "ok"
            final_reason = "env_step_fallback"
            used_fallback = True
            break

        final_observation = next_observation
        total_step_reward += safe_float(safe_getattr(final_observation, "reward", 0.0), 0.0)
        done = bool(safe_getattr(final_observation, "done", False))
        score = clamp(safe_getattr(final_observation, "score", 0.0))
        # Prefer the environment's own status text, then the LLM fallback
        # reason, then a generic per-step marker.
        if safe_getattr(final_observation, "last_action_status", ""):
            final_reason = safe_text(safe_getattr(final_observation, "last_action_status", ""), "step_completed")
        elif action_payload.get("fallback_reason"):
            final_reason = safe_text(action_payload.get("fallback_reason"), "llm_fallback")
        else:
            final_reason = f"step_{step_index + 1}_completed"
        if done:
            break

        if step_index == 0:
            # Force a submission after the first step so short episodes score.
            submit_action = make_action({"action_type": "submit_solution", "code": None})
            submitted_observation = safe_step(env, submit_action)
            if submitted_observation is None:
                final_reason = "submit_fallback"
                used_fallback = True
                break
            final_observation = submitted_observation
            total_step_reward += safe_float(safe_getattr(final_observation, "reward", 0.0), 0.0)
            if safe_getattr(final_observation, "last_action_status", ""):
                final_reason = safe_text(safe_getattr(final_observation, "last_action_status", ""), "submit_completed")
            break

    current_score = clamp(safe_getattr(final_observation, "score", previous_score))
    done = bool(safe_getattr(final_observation, "done", True))
    metrics = compute_reward(
        previous_score=previous_score,
        current_score=current_score,
        step_reward=total_step_reward,
        used_fallback=used_fallback,
        done=done,
    )
    return {
        "status": final_status,
        "fallback": used_fallback,
        "reason": safe_text(final_reason, "completed").lower().replace(" ", "_")[:64],
        "reward": metrics["reward"],
        "improvement": metrics["improvement"],
        "score": round(current_score, 4),
        "done": done,
        "docker": safe_text(docker_status.get("reason", "docker_skip"), "docker_skip")[:32],
    }
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
def format_step_message(result: Dict[str, Any]) -> str:
|
| 407 |
+
"""Format the only allowed STEP line for stdout."""
|
| 408 |
+
try:
|
| 409 |
+
fallback = bool(result.get("fallback", False))
|
| 410 |
+
reason = safe_text(result.get("reason", "completed"), "completed").lower().replace(" ", "_")
|
| 411 |
+
if fallback:
|
| 412 |
+
reward = safe_float(result.get("reward", 0.0), 0.0)
|
| 413 |
+
improvement = safe_float(result.get("improvement", 0.0), 0.0)
|
| 414 |
+
score = safe_float(result.get("score", 0.0), 0.0)
|
| 415 |
+
status = safe_text(result.get("status", "ok"), "ok").lower().replace(" ", "_")
|
| 416 |
+
return (
|
| 417 |
+
f"error handled: {reason} reward={reward:.4f} status={status} "
|
| 418 |
+
f"fallback=true improvement={improvement:.4f} score={score:.4f}"
|
| 419 |
+
)
|
| 420 |
+
reward = safe_float(result.get("reward", 0.0), 0.0)
|
| 421 |
+
improvement = safe_float(result.get("improvement", 0.0), 0.0)
|
| 422 |
+
score = safe_float(result.get("score", 0.0), 0.0)
|
| 423 |
+
status = safe_text(result.get("status", "ok"), "ok").lower().replace(" ", "_")
|
| 424 |
+
return (
|
| 425 |
+
f"reward={reward:.4f} status={status} "
|
| 426 |
+
f"fallback=false improvement={improvement:.4f} score={score:.4f}"
|
| 427 |
+
)
|
| 428 |
+
except Exception:
|
| 429 |
+
return "error handled: formatting_failed"
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
def main() -> int:
|
| 433 |
+
"""Run the inference workflow and always terminate successfully."""
|
| 434 |
+
step_message = "error handled: initialization_failed"
|
| 435 |
+
try:
|
| 436 |
+
model_name = safe_env("MODEL_NAME", DEFAULT_MODEL_NAME) or DEFAULT_MODEL_NAME
|
| 437 |
+
client = create_client()
|
| 438 |
+
result = run_env(client, model_name)
|
| 439 |
+
step_message = format_step_message(result)
|
| 440 |
+
except BaseException as exc:
|
| 441 |
+
step_message = f"error handled: {safe_text(exc, 'unexpected_failure').lower().replace(' ', '_')[:64]}"
|
| 442 |
+
finally:
|
| 443 |
+
try:
|
| 444 |
+
print("START")
|
| 445 |
+
print(f"STEP: {step_message}")
|
| 446 |
+
print("END")
|
| 447 |
+
except Exception:
|
| 448 |
+
pass
|
| 449 |
+
return 0
|
| 450 |
+
|
| 451 |
+
|
| 452 |
+
if __name__ == "__main__":
|
| 453 |
+
try:
|
| 454 |
+
main()
|
| 455 |
+
except BaseException:
|
| 456 |
+
try:
|
| 457 |
+
print("START")
|
| 458 |
+
print("STEP: error handled: fatal_guard")
|
| 459 |
+
print("END")
|
| 460 |
+
except Exception:
|
| 461 |
+
pass
|
| 462 |
+
sys.exit(0)
|
models.py
CHANGED
|
@@ -1,217 +1,185 @@
|
|
| 1 |
-
"""Typed models for
|
| 2 |
|
| 3 |
-
|
| 4 |
|
| 5 |
-
|
| 6 |
-
- the REST API layer
|
| 7 |
-
- the benchmark grader
|
| 8 |
-
- the inference script
|
| 9 |
-
- the tests
|
| 10 |
-
|
| 11 |
-
Keeping these models centralized makes the environment easier to validate,
|
| 12 |
-
serialize, and evolve without each module inventing its own payload shape.
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
from typing import List, Literal, Optional
|
| 16 |
|
| 17 |
from pydantic import BaseModel, Field
|
| 18 |
-
from openenv.core.env_server.types import Action, Observation
|
| 19 |
|
|
|
|
| 20 |
|
| 21 |
-
# Difficulty buckets are intentionally small and fixed so tasks can be
|
| 22 |
-
# grouped for curriculum learning and reporting without extra normalization.
|
| 23 |
-
Difficulty = Literal["easy", "medium", "hard"]
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
| 27 |
Severity = Literal["critical", "warning", "info"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
class ReviewFinding(BaseModel):
|
| 37 |
-
"""
|
| 38 |
-
|
| 39 |
-
Each finding is designed to be machine-gradable while still resembling the
|
| 40 |
-
sort of issue summary a human reviewer would write in a real code review.
|
| 41 |
-
"""
|
| 42 |
-
|
| 43 |
-
title: str = Field(..., description="Short title for the finding")
|
| 44 |
-
line: Optional[int] = Field(default=None, description="1-based source line number")
|
| 45 |
-
category: Category = Field(default="bug", description="Issue category")
|
| 46 |
-
severity: Severity = Field(default="warning", description="Issue severity")
|
| 47 |
-
rationale: str = Field(
|
| 48 |
-
default="",
|
| 49 |
-
description="Why the issue matters and how it affects behaviour or safety",
|
| 50 |
-
)
|
| 51 |
-
recommendation: Optional[str] = Field(
|
| 52 |
-
default=None, description="Concrete fix recommendation"
|
| 53 |
-
)
|
| 54 |
-
rule_id: Optional[str] = Field(
|
| 55 |
-
default=None,
|
| 56 |
-
description="Stable internal rule identifier when known",
|
| 57 |
-
)
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
task_id: str = Field(..., description="Stable task identifier")
|
| 68 |
-
difficulty: Difficulty = Field(..., description="Task difficulty bucket")
|
| 69 |
-
title: str = Field(..., description="Short task title")
|
| 70 |
-
objective: str = Field(..., description="What the reviewer should accomplish")
|
| 71 |
-
code: str = Field(..., description="Python code to review")
|
| 72 |
-
max_steps: int = Field(..., ge=1, description="Maximum actions allowed")
|
| 73 |
-
success_threshold: float = Field(
|
| 74 |
-
..., ge=0.0, le=1.0, description="Minimum score considered a pass"
|
| 75 |
-
)
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
class TaskEvaluation(BaseModel):
|
| 79 |
-
"""Deterministic grader output.
|
| 80 |
-
|
| 81 |
-
This model is returned in observations and offline grading routes so that
|
| 82 |
-
both online interaction and offline evaluation use exactly the same metrics.
|
| 83 |
-
"""
|
| 84 |
-
|
| 85 |
-
matched_reference_ids: List[str] = Field(default_factory=list)
|
| 86 |
-
matched_findings: int = Field(default=0, ge=0)
|
| 87 |
-
total_findings: int = Field(default=0, ge=0)
|
| 88 |
-
false_positives: int = Field(default=0, ge=0)
|
| 89 |
-
duplicate_findings: int = Field(default=0, ge=0)
|
| 90 |
-
weighted_recall: float = Field(default=0.0, ge=0.0, le=1.0)
|
| 91 |
-
patch_score: float = Field(default=0.0, ge=0.0, le=1.0)
|
| 92 |
-
score: float = Field(default=0.0, ge=0.0, le=1.0)
|
| 93 |
-
passed: bool = Field(default=False)
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
class PythonReviewAction(Action):
|
| 97 |
-
"""Action submitted by an agent during an episode.
|
| 98 |
-
|
| 99 |
-
The action space is kept intentionally small:
|
| 100 |
-
|
| 101 |
-
- `submit_findings` for intermediate progress
|
| 102 |
-
- `request_hint` when the agent needs guidance at a small penalty
|
| 103 |
-
- `finalize` when the agent wants the episode to end
|
| 104 |
-
"""
|
| 105 |
-
|
| 106 |
-
operation: Operation = Field(
|
| 107 |
-
default="submit_findings",
|
| 108 |
-
description="How to interact with the environment on this step",
|
| 109 |
-
)
|
| 110 |
-
findings: List[ReviewFinding] = Field(
|
| 111 |
-
default_factory=list,
|
| 112 |
-
description="Structured findings being submitted for grading",
|
| 113 |
-
)
|
| 114 |
-
patched_code: Optional[str] = Field(
|
| 115 |
-
default=None,
|
| 116 |
-
description="Optional improved version of the code under review",
|
| 117 |
-
)
|
| 118 |
-
note: Optional[str] = Field(
|
| 119 |
-
default=None,
|
| 120 |
-
description="Optional free-form reviewer note for logging or context",
|
| 121 |
-
)
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
class PythonEnvConfig(BaseModel):
|
| 125 |
-
"""Environment-level configuration knobs.
|
| 126 |
-
|
| 127 |
-
These values are useful for experimentation because they let you adjust
|
| 128 |
-
reward shaping and curriculum ordering without changing the grader logic.
|
| 129 |
-
"""
|
| 130 |
-
|
| 131 |
-
task_order: List[str] = Field(
|
| 132 |
-
default_factory=lambda: ["py-review-easy", "py-review-medium", "py-review-hard"],
|
| 133 |
-
description="Deterministic task order used across resets",
|
| 134 |
-
)
|
| 135 |
-
max_steps_per_task: int = Field(default=4, ge=1, le=10)
|
| 136 |
-
hint_penalty: float = Field(default=0.05, ge=0.0, le=1.0)
|
| 137 |
-
false_positive_penalty: float = Field(default=0.08, ge=0.0, le=1.0)
|
| 138 |
-
duplicate_penalty: float = Field(default=0.03, ge=0.0, le=1.0)
|
| 139 |
-
patch_bonus_multiplier: float = Field(default=0.2, ge=0.0, le=1.0)
|
| 140 |
-
max_history_entries: int = Field(default=50, ge=1, le=500)
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
class PythonReviewObservation(Observation):
|
| 144 |
-
"""Observation returned by `reset()` and `step()`.
|
| 145 |
-
|
| 146 |
-
The observation combines:
|
| 147 |
-
|
| 148 |
-
- visible task context
|
| 149 |
-
- immediate feedback on the previous action
|
| 150 |
-
- cumulative evaluation state
|
| 151 |
-
- OpenEnv-standard reward/done/metadata fields
|
| 152 |
-
"""
|
| 153 |
-
|
| 154 |
-
task: TaskDescriptor = Field(..., description="Current task details")
|
| 155 |
-
instructions: str = Field(
|
| 156 |
-
default="Inspect the code and submit structured findings.",
|
| 157 |
-
description="Episode instructions shown to the agent",
|
| 158 |
-
)
|
| 159 |
-
feedback: str = Field(default="", description="Feedback for the last action")
|
| 160 |
-
submitted_findings: List[ReviewFinding] = Field(
|
| 161 |
-
default_factory=list,
|
| 162 |
-
description="All findings submitted so far in this episode",
|
| 163 |
-
)
|
| 164 |
-
hints_used: int = Field(default=0, ge=0)
|
| 165 |
-
attempts_remaining: int = Field(default=0, ge=0)
|
| 166 |
-
evaluation: TaskEvaluation = Field(default_factory=TaskEvaluation)
|
| 167 |
-
score: float = Field(
|
| 168 |
-
default=0.0,
|
| 169 |
-
ge=0.0,
|
| 170 |
-
le=1.0,
|
| 171 |
-
description="Current task score after this step",
|
| 172 |
-
)
|
| 173 |
-
review_time_ms: float = Field(default=0.0, ge=0.0)
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
class EpisodeRecord(BaseModel):
|
| 177 |
-
"""Stored summary of a completed or in-progress episode.
|
| 178 |
-
|
| 179 |
-
This model is used by the custom history routes and is intentionally
|
| 180 |
-
compact enough to archive for later analysis or dataset creation.
|
| 181 |
-
"""
|
| 182 |
-
|
| 183 |
-
episode_id: str
|
| 184 |
-
task_id: str
|
| 185 |
-
difficulty: Difficulty
|
| 186 |
-
title: str
|
| 187 |
-
final_score: float = Field(ge=0.0, le=1.0)
|
| 188 |
-
passed: bool = Field(default=False)
|
| 189 |
-
steps_taken: int = Field(default=0, ge=0)
|
| 190 |
-
hints_used: int = Field(default=0, ge=0)
|
| 191 |
-
matched_findings: int = Field(default=0, ge=0)
|
| 192 |
-
total_findings: int = Field(default=0, ge=0)
|
| 193 |
-
false_positives: int = Field(default=0, ge=0)
|
| 194 |
-
duplicate_findings: int = Field(default=0, ge=0)
|
| 195 |
-
status: Literal["active", "completed"] = Field(default="completed")
|
| 196 |
-
created_at: str
|
| 197 |
-
updated_at: str
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
class DirectReviewRequest(BaseModel):
|
| 201 |
-
"""Request model for ad-hoc review outside the benchmark tasks."""
|
| 202 |
-
|
| 203 |
-
code: str = Field(..., description="Python source code to inspect")
|
| 204 |
-
context: Optional[str] = Field(
|
| 205 |
-
default=None, description="Optional explanation of the code's purpose"
|
| 206 |
-
)
|
| 207 |
|
| 208 |
|
| 209 |
class DirectReviewResponse(BaseModel):
|
| 210 |
-
"""
|
| 211 |
-
|
| 212 |
-
This route is useful for manual testing and dataset generation because it
|
| 213 |
-
lets you review arbitrary snippets without entering the benchmark loop.
|
| 214 |
-
"""
|
| 215 |
|
| 216 |
issues: List[ReviewFinding] = Field(default_factory=list)
|
| 217 |
summary: str = Field(default="")
|
|
@@ -219,30 +187,26 @@ class DirectReviewResponse(BaseModel):
|
|
| 219 |
improved_code: Optional[str] = Field(default=None)
|
| 220 |
|
| 221 |
|
| 222 |
-
class
|
| 223 |
-
"""
|
| 224 |
-
|
| 225 |
-
detail: str
|
| 226 |
-
|
| 227 |
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
CodeReviewConfig = PythonEnvConfig
|
|
|
|
| 1 |
+
"""Typed models for Python code review and repair environment."""
|
| 2 |
|
| 3 |
+
from __future__ import annotations
|
| 4 |
|
| 5 |
+
from typing import Any, Dict, List, Literal, Optional
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
from pydantic import BaseModel, Field
|
|
|
|
| 8 |
|
| 9 |
+
from compat import Action, Observation, State
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
+
Difficulty = Literal["easy", "medium", "hard"]
|
| 13 |
+
TaskKind = Literal["syntax_fix", "bug_fix", "optimization"]
|
| 14 |
+
ActionType = Literal["analyze_code", "edit_code", "run_tests", "submit_solution"]
|
| 15 |
+
Category = Literal["bug", "security", "performance", "maintainability", "style", "testing"]
|
| 16 |
Severity = Literal["critical", "warning", "info"]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class HistoryEntry(BaseModel):
|
| 20 |
+
"""Record of one action taken during an episode."""
|
| 21 |
+
|
| 22 |
+
step: int = Field(..., ge=0)
|
| 23 |
+
action_type: ActionType
|
| 24 |
+
status: str = Field(..., description="Outcome message")
|
| 25 |
+
reward: float = Field(...)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class RewardDetails(BaseModel):
|
| 29 |
+
"""Detailed reward breakdown for transparent agent feedback.
|
| 30 |
+
|
| 31 |
+
The reward system is dynamic and multi-component, with 6 independent sources:
|
| 32 |
+
|
| 33 |
+
1. Progress Reward (max +0.25)
|
| 34 |
+
- Awarded for score improvement from previous step
|
| 35 |
+
- Formula: min(PROGRESS_SCALE * score_delta, 0.25)
|
| 36 |
+
- Encourages continuous improvement
|
| 37 |
+
|
| 38 |
+
2. Syntax Reward (max +0.35)
|
| 39 |
+
- One-time bonus for fixing syntax errors (first compile)
|
| 40 |
+
- Applied when code transitions from uncompilable to compilable
|
| 41 |
+
- Acknowledges the critical first step of valid code
|
| 42 |
+
|
| 43 |
+
3. Test Reward (max +0.20)
|
| 44 |
+
- Based on improvement in test pass rate
|
| 45 |
+
- Formula: min(TEST_PASS_REWARD_SCALE * test_improvement, 0.20)
|
| 46 |
+
- Rewards incremental test progress
|
| 47 |
+
|
| 48 |
+
4. Quality Reward (max +0.15)
|
| 49 |
+
- Based on AST-detected code quality metrics
|
| 50 |
+
- Rewards improvements in structure, readability, best practices
|
| 51 |
+
- Uses deterministic grader feedback
|
| 52 |
+
|
| 53 |
+
5. Stagnation Penalty (−0.10)
|
| 54 |
+
- Applied when agent acts but code doesn't change
|
| 55 |
+
- Encourages editing rather than repeated analysis
|
| 56 |
+
- Configurable via STAGNATION_PENALTY constant
|
| 57 |
+
|
| 58 |
+
6. Regression Penalty (scale −0.20)
|
| 59 |
+
- Applied when score decreases from previous step
|
| 60 |
+
- Formula: REGRESSION_PENALTY_SCALE * abs(score_delta)
|
| 61 |
+
- Discourages actions that make code worse
|
| 62 |
+
|
| 63 |
+
Final Reward: clamp(progress + syntax + test + quality - stagnation - regression, -1.0, +1.0)
|
| 64 |
+
|
| 65 |
+
The result is always bounded in [-1.0, +1.0], providing interpretable feedback for learning.
|
| 66 |
+
"""
|
| 67 |
+
|
| 68 |
+
value: float = Field(..., description="Net scalar reward for this step (bounded in [-1.0, +1.0])")
|
| 69 |
+
syntax_reward: float = Field(default=0.0, description="Bonus for fixing syntax errors (max +0.35)")
|
| 70 |
+
test_reward: float = Field(default=0.0, description="Reward from test improvements (max +0.20)")
|
| 71 |
+
quality_bonus: float = Field(default=0.0, description="Bonus for code quality improvements (max +0.15)")
|
| 72 |
+
correctness_bonus: float = Field(default=0.0, description="Bonus for full correctness (max +0.50)")
|
| 73 |
+
progress_delta: float = Field(default=0.0, description="Reward from score improvement (max +0.25)")
|
| 74 |
+
stagnation_penalty: float = Field(default=0.0, description="Penalty for unchanged code (−0.10)")
|
| 75 |
+
regression_penalty: float = Field(default=0.0, description="Penalty for score decline (scale −0.20)")
|
| 76 |
+
invalid_action_penalty: float = Field(default=0.0, description="Penalty for invalid actions (−0.15)")
|
| 77 |
+
timeout_penalty: float = Field(default=0.0, description="Penalty for execution timeout (−0.15)")
|
| 78 |
+
reason: str = Field(..., description="Human-readable explanation of the reward")
|
| 79 |
+
|
| 80 |
+
# Debug information for transparency
|
| 81 |
+
prev_score: float = Field(default=0.0, description="Score before this step")
|
| 82 |
+
curr_score: float = Field(default=0.0, description="Score after this step")
|
| 83 |
+
code_changed: bool = Field(default=False, description="Whether the action modified the code")
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class PythonCodeReviewAction(Action):
|
| 87 |
+
"""Action space for code review environment."""
|
| 88 |
+
|
| 89 |
+
action_type: ActionType = Field(..., description="Type of action to perform")
|
| 90 |
+
code: Optional[str] = Field(default=None, description="New code for edit_code actions")
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class PythonCodeReviewObservation(Observation):
|
| 94 |
+
"""Observation returned by reset() and step()."""
|
| 95 |
+
|
| 96 |
+
task_id: str = Field(..., description="Current task identifier")
|
| 97 |
+
title: str = Field(default="", description="Human-readable task title")
|
| 98 |
+
difficulty: Difficulty = Field(..., description="Task difficulty level")
|
| 99 |
+
task_kind: Optional[TaskKind] = Field(default=None, description="Task type")
|
| 100 |
+
task_description: str = Field(..., description="Detailed task description")
|
| 101 |
+
current_code: str = Field(..., description="Current code state")
|
| 102 |
+
errors: str = Field(..., description="Syntax/compilation errors, if any")
|
| 103 |
+
test_results: str = Field(..., description="Results from test execution")
|
| 104 |
+
visible_tests: List[str] = Field(default_factory=list, description="Public test cases")
|
| 105 |
+
history: List[HistoryEntry] = Field(default_factory=list, description="Action history")
|
| 106 |
+
attempts_remaining: int = Field(..., ge=0, description="Actions left in episode")
|
| 107 |
+
last_action_status: str = Field(default="", description="Outcome message from the last action")
|
| 108 |
+
score: float = Field(..., ge=0.0, le=1.0, description="Current episode score")
|
| 109 |
+
reward_details: RewardDetails = Field(
|
| 110 |
+
default_factory=lambda: RewardDetails(value=0.0, reason="Reset"),
|
| 111 |
+
description="Detailed reward breakdown for the last action",
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
class PythonCodeReviewState(State):
|
| 116 |
+
"""Exposed environment state."""
|
| 117 |
+
|
| 118 |
+
episode_id: str = Field(..., description="Unique episode identifier")
|
| 119 |
+
step_count: int = Field(default=0, ge=0)
|
| 120 |
+
task_id: Optional[str] = Field(default=None)
|
| 121 |
+
difficulty: Optional[Difficulty] = Field(default=None)
|
| 122 |
+
task_kind: Optional[TaskKind] = Field(default=None)
|
| 123 |
+
attempts_remaining: int = Field(default=0, ge=0)
|
| 124 |
+
current_code: str = Field(default="")
|
| 125 |
+
errors: str = Field(default="")
|
| 126 |
+
test_results: str = Field(default="")
|
| 127 |
+
history: List[HistoryEntry] = Field(default_factory=list)
|
| 128 |
+
score: float = Field(default=0.0, ge=0.0, le=1.0)
|
| 129 |
+
done: bool = Field(default=False)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
class TaskDescriptor(BaseModel):
|
| 133 |
+
"""Public task metadata."""
|
| 134 |
|
| 135 |
+
task_id: str = Field(..., description="Stable task identifier")
|
| 136 |
+
title: str = Field(..., description="Human-readable title")
|
| 137 |
+
difficulty: Difficulty = Field(..., description="Difficulty level")
|
| 138 |
+
task_kind: Optional[TaskKind] = Field(default=None, description="Type of task")
|
| 139 |
+
task_description: str = Field(default="", description="Full task description")
|
| 140 |
+
starter_code: str = Field(default="", description="Initial broken code")
|
| 141 |
+
visible_tests: List[str] = Field(default_factory=list, description="Public test cases")
|
| 142 |
+
goal: str = Field(default="", description="Optional goal summary for review-style tasks")
|
| 143 |
+
repo_summary: str = Field(default="", description="Optional repository context")
|
| 144 |
+
changed_files: List[str] = Field(default_factory=list, description="Changed files for review-style tasks")
|
| 145 |
+
available_files: List[str] = Field(default_factory=list, description="Browsable files for review-style tasks")
|
| 146 |
+
max_steps: int = Field(..., ge=1, description="Maximum steps allowed")
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class TaskSummary(BaseModel):
|
| 150 |
+
"""Lightweight task metadata for list endpoints."""
|
| 151 |
|
| 152 |
+
task_id: str = Field(..., description="Stable task identifier")
|
| 153 |
+
difficulty: Difficulty = Field(..., description="Difficulty level")
|
| 154 |
+
title: str = Field(..., description="Human-readable title")
|
| 155 |
+
goal: str = Field(default="", description="Optional task goal")
|
| 156 |
|
| 157 |
|
| 158 |
class ReviewFinding(BaseModel):
|
| 159 |
+
"""Structured code review finding used by auxiliary review utilities."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
+
title: str = Field(..., description="Short human-readable finding title")
|
| 162 |
+
file_path: str = Field(default="", description="Optional file path")
|
| 163 |
+
line: Optional[int] = Field(default=None, ge=1, description="Optional 1-based line number")
|
| 164 |
+
category: Category = Field(default="bug", description="Finding category")
|
| 165 |
+
severity: Severity = Field(default="warning", description="Finding severity")
|
| 166 |
+
rationale: str = Field(default="", description="Why this matters")
|
| 167 |
+
recommendation: str = Field(default="", description="Suggested remediation")
|
| 168 |
+
rule_id: str = Field(default="", description="Stable detector or rubric identifier")
|
| 169 |
|
| 170 |
+
@property
|
| 171 |
+
def explanation(self) -> str:
|
| 172 |
+
"""Backward-compatible alias used by older grading helpers."""
|
| 173 |
+
return self.rationale
|
| 174 |
|
| 175 |
+
@property
|
| 176 |
+
def suggested_fix(self) -> str:
|
| 177 |
+
"""Backward-compatible alias used by older grading helpers."""
|
| 178 |
+
return self.recommendation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
|
| 181 |
class DirectReviewResponse(BaseModel):
|
| 182 |
+
"""Response payload for deterministic direct-review utilities."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
issues: List[ReviewFinding] = Field(default_factory=list)
|
| 185 |
summary: str = Field(default="")
|
|
|
|
| 187 |
improved_code: Optional[str] = Field(default=None)
|
| 188 |
|
| 189 |
|
| 190 |
+
class TaskGrade(BaseModel):
|
| 191 |
+
"""Grading result for task submission."""
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
+
score: float = Field(..., ge=0.0, le=1.0, description="Overall score")
|
| 194 |
+
syntax_score: float = Field(default=0.0, ge=0.0, le=1.0)
|
| 195 |
+
tests_passed: int = Field(default=0, ge=0)
|
| 196 |
+
tests_total: int = Field(default=0, ge=0)
|
| 197 |
+
quality_score: float = Field(default=0.0, ge=0.0, le=1.0)
|
| 198 |
+
runtime_score: float = Field(default=0.0, ge=0.0, le=1.0)
|
| 199 |
+
timed_out: bool = Field(default=False)
|
| 200 |
+
matched_issue_ids: List[str] = Field(default_factory=list)
|
| 201 |
+
false_positives: int = Field(default=0, ge=0)
|
| 202 |
+
duplicate_findings: int = Field(default=0, ge=0)
|
| 203 |
+
matched_weight: float = Field(default=0.0, ge=0.0, le=1.0)
|
| 204 |
+
details: Dict[str, Any] = Field(default_factory=dict)
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
class HealthResponse(BaseModel):
|
| 208 |
+
"""Health check response."""
|
| 209 |
+
|
| 210 |
+
status: Literal["ok"] = "ok"
|
| 211 |
+
environment: str = "python_code_review_env"
|
| 212 |
+
task_count: int = Field(default=0, ge=0)
|
|
|
openenv.yaml
CHANGED
|
@@ -1,7 +1,20 @@
|
|
| 1 |
-
spec_version: 1
|
| 2 |
-
name:
|
| 3 |
-
type: space
|
| 4 |
-
runtime: fastapi
|
| 5 |
-
app: server.app:app
|
| 6 |
-
port: 8000
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: python_code_review_env
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
| 7 |
+
|
| 8 |
+
metadata:
|
| 9 |
+
description: "Production-grade Python code review and repair benchmark for OpenEnv"
|
| 10 |
+
domain: code-review
|
| 11 |
+
task_count: 3
|
| 12 |
+
task_ids:
|
| 13 |
+
- syntax-fix-easy
|
| 14 |
+
- bug-fix-medium
|
| 15 |
+
- optimization-hard
|
| 16 |
+
difficulty_levels:
|
| 17 |
+
- easy
|
| 18 |
+
- medium
|
| 19 |
+
- hard
|
| 20 |
+
|
openenv_python_env.egg-info/PKG-INFO
CHANGED
|
@@ -1,10 +1,13 @@
|
|
| 1 |
Metadata-Version: 2.4
|
| 2 |
Name: openenv-python_env
|
| 3 |
-
Version: 0.
|
| 4 |
-
Summary: Python
|
| 5 |
Requires-Python: >=3.10
|
| 6 |
Requires-Dist: openenv-core[core]>=0.2.2
|
| 7 |
-
Requires-Dist:
|
|
|
|
|
|
|
|
|
|
| 8 |
Provides-Extra: dev
|
| 9 |
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 10 |
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
|
|
| 1 |
Metadata-Version: 2.4
|
| 2 |
Name: openenv-python_env
|
| 3 |
+
Version: 0.2.0
|
| 4 |
+
Summary: Deterministic Python code review and repair benchmark environment for OpenEnv
|
| 5 |
Requires-Python: >=3.10
|
| 6 |
Requires-Dist: openenv-core[core]>=0.2.2
|
| 7 |
+
Requires-Dist: fastapi>=0.115.0
|
| 8 |
+
Requires-Dist: uvicorn>=0.30.0
|
| 9 |
+
Requires-Dist: openai>=1.40.0
|
| 10 |
+
Requires-Dist: pytest>=8.0.0
|
| 11 |
Provides-Extra: dev
|
| 12 |
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 13 |
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
openenv_python_env.egg-info/SOURCES.txt
CHANGED
|
@@ -1,11 +1,8 @@
|
|
| 1 |
README.md
|
| 2 |
-
__init__.py
|
| 3 |
-
client.py
|
| 4 |
-
inference.py
|
| 5 |
-
models.py
|
| 6 |
pyproject.toml
|
| 7 |
./__init__.py
|
| 8 |
./client.py
|
|
|
|
| 9 |
./inference.py
|
| 10 |
./models.py
|
| 11 |
openenv_python_env.egg-info/PKG-INFO
|
|
@@ -16,4 +13,15 @@ openenv_python_env.egg-info/requires.txt
|
|
| 16 |
openenv_python_env.egg-info/top_level.txt
|
| 17 |
server/__init__.py
|
| 18 |
server/app.py
|
| 19 |
-
server/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
README.md
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
pyproject.toml
|
| 3 |
./__init__.py
|
| 4 |
./client.py
|
| 5 |
+
./compat.py
|
| 6 |
./inference.py
|
| 7 |
./models.py
|
| 8 |
openenv_python_env.egg-info/PKG-INFO
|
|
|
|
| 13 |
openenv_python_env.egg-info/top_level.txt
|
| 14 |
server/__init__.py
|
| 15 |
server/app.py
|
| 16 |
+
server/code_review_env_environment.py
|
| 17 |
+
server/code_review_environment.py
|
| 18 |
+
server/env.py
|
| 19 |
+
server/env_safe.py
|
| 20 |
+
server/grading.py
|
| 21 |
+
server/python_env_environment.py
|
| 22 |
+
server/static_review.py
|
| 23 |
+
server/task_bank.py
|
| 24 |
+
tests/test_api.py
|
| 25 |
+
tests/test_environment.py
|
| 26 |
+
tests/test_examples.py
|
| 27 |
+
tests/test_reward_dynamics.py
|
openenv_python_env.egg-info/requires.txt
CHANGED
|
@@ -1,5 +1,8 @@
|
|
| 1 |
openenv-core[core]>=0.2.2
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
[dev]
|
| 5 |
pytest>=8.0.0
|
|
|
|
| 1 |
openenv-core[core]>=0.2.2
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn>=0.30.0
|
| 4 |
+
openai>=1.40.0
|
| 5 |
+
pytest>=8.0.0
|
| 6 |
|
| 7 |
[dev]
|
| 8 |
pytest>=8.0.0
|
pyproject.toml
CHANGED
|
@@ -1,46 +1,33 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
[
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
"
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
[
|
| 33 |
-
|
| 34 |
-
"pytest>=8.0.0",
|
| 35 |
-
"pytest-cov>=4.0.0",
|
| 36 |
-
]
|
| 37 |
-
|
| 38 |
-
[project.scripts]
|
| 39 |
-
# Server entry point - enables running via: uv run --project . server
|
| 40 |
-
# or: python -m python_env.server.app
|
| 41 |
-
server = "python_env.server.app:main"
|
| 42 |
-
|
| 43 |
-
[tool.setuptools]
|
| 44 |
-
include-package-data = true
|
| 45 |
-
packages = ["python_env", "python_env.server"]
|
| 46 |
-
package-dir = { "python_env" = ".", "python_env.server" = "server" }
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=45", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "openenv-python_env"
|
| 7 |
+
version = "0.2.0"
|
| 8 |
+
description = "Deterministic Python code review and repair benchmark environment for OpenEnv"
|
| 9 |
+
requires-python = ">=3.10"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"openenv-core[core]>=0.2.2",
|
| 12 |
+
"fastapi>=0.115.0",
|
| 13 |
+
"uvicorn>=0.30.0",
|
| 14 |
+
"openai>=1.40.0",
|
| 15 |
+
"pytest>=8.0.0",
|
| 16 |
+
]
|
| 17 |
+
|
| 18 |
+
[project.optional-dependencies]
|
| 19 |
+
dev = [
|
| 20 |
+
"pytest>=8.0.0",
|
| 21 |
+
"pytest-cov>=4.0.0",
|
| 22 |
+
]
|
| 23 |
+
|
| 24 |
+
[project.scripts]
|
| 25 |
+
server = "python_env.server.app:main"
|
| 26 |
+
|
| 27 |
+
[tool.setuptools]
|
| 28 |
+
include-package-data = true
|
| 29 |
+
packages = ["python_env", "python_env.server"]
|
| 30 |
+
package-dir = { "python_env" = ".", "python_env.server" = "server" }
|
| 31 |
+
|
| 32 |
+
[tool.pytest.ini_options]
|
| 33 |
+
testpaths = ["tests"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pytest-cache-files-1f62ra1g/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-1f62ra1g/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
pytest-cache-files-i2cpw3zw/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-i2cpw3zw/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
pytest-cache-files-le0qcl0z/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-le0qcl0z/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
pytest-cache-files-qm8xzmpt/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-qm8xzmpt/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
pytest-cache-files-qun9v98v/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-qun9v98v/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
pytest-cache-files-srp2otxc/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-srp2otxc/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
pytest-cache-files-u6t7g29i/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-u6t7g29i/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
pytest-cache-files-x1yzwik9/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-x1yzwik9/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
server/__init__.py
CHANGED
|
@@ -1,11 +1,5 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
"""Python Env environment server components."""
|
| 8 |
-
|
| 9 |
-
from .python_env_environment import PythonEnvironment
|
| 10 |
-
|
| 11 |
-
__all__ = ["PythonEnvironment"]
|
|
|
|
| 1 |
+
"""Server exports for the Python code review environment."""
|
| 2 |
+
|
| 3 |
+
from .code_review_environment import CodeReviewEnvironment, PythonCodeReviewEnvironment, PythonEnvironment
|
| 4 |
+
|
| 5 |
+
__all__ = ["PythonEnvironment", "PythonCodeReviewEnvironment", "CodeReviewEnvironment"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/app.py
CHANGED
|
@@ -1,84 +1,117 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
# Production:
|
| 25 |
-
uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
|
| 26 |
-
|
| 27 |
-
# Or run directly:
|
| 28 |
-
python -m server.app
|
| 29 |
-
"""
|
| 30 |
-
|
| 31 |
try:
|
| 32 |
-
|
| 33 |
-
except Exception
|
| 34 |
-
|
| 35 |
-
"openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
|
| 36 |
-
) from e
|
| 37 |
-
|
| 38 |
-
try:
|
| 39 |
-
from ..models import PythonAction, PythonObservation
|
| 40 |
-
from .python_env_environment import PythonEnvironment
|
| 41 |
-
except ImportError:
|
| 42 |
-
from models import PythonAction, PythonObservation
|
| 43 |
-
from server.python_env_environment import PythonEnvironment
|
| 44 |
-
|
| 45 |
|
| 46 |
-
|
| 47 |
app = create_app(
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
)
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application for the Python code review environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
from fastapi import APIRouter, HTTPException
|
| 8 |
+
from fastapi.responses import RedirectResponse
|
| 9 |
+
|
| 10 |
+
from compat import create_app
|
| 11 |
+
|
| 12 |
+
from models import (
|
| 13 |
+
HealthResponse,
|
| 14 |
+
PythonCodeReviewAction,
|
| 15 |
+
PythonCodeReviewObservation,
|
| 16 |
+
PythonCodeReviewState,
|
| 17 |
+
TaskDescriptor,
|
| 18 |
+
TaskGrade,
|
| 19 |
+
)
|
| 20 |
+
from server.env import PythonCodeReviewEnvironment
|
| 21 |
+
|
| 22 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
try:
|
| 24 |
+
MAX_CONCURRENT_ENVS = max(int(os.getenv("MAX_CONCURRENT_ENVS", "16")), 1)
|
| 25 |
+
except Exception:
|
| 26 |
+
MAX_CONCURRENT_ENVS = 16
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
+
python_env = PythonCodeReviewEnvironment(verbose=False)
|
| 29 |
app = create_app(
|
| 30 |
+
PythonCodeReviewEnvironment,
|
| 31 |
+
PythonCodeReviewAction,
|
| 32 |
+
PythonCodeReviewObservation,
|
| 33 |
+
max_concurrent_envs=MAX_CONCURRENT_ENVS,
|
| 34 |
+
)
|
| 35 |
+
router = APIRouter(tags=["python-code-review"])
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@router.get("/", include_in_schema=False)
|
| 39 |
+
def root() -> RedirectResponse:
|
| 40 |
+
"""Redirect root to API documentation."""
|
| 41 |
+
return RedirectResponse(url="/docs")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@router.get("/health", response_model=HealthResponse)
|
| 45 |
+
def health() -> HealthResponse:
|
| 46 |
+
"""Health check endpoint for deployment monitoring."""
|
| 47 |
+
return python_env.health()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@router.get("/tasks", response_model=list)
|
| 51 |
+
def list_tasks() -> list:
|
| 52 |
+
"""List all available deterministic tasks."""
|
| 53 |
+
return python_env.list_task_summaries()
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@router.get("/tasks/{task_id}", response_model=object)
|
| 57 |
+
def get_task(task_id: str) -> object:
|
| 58 |
+
"""Get a specific task by ID."""
|
| 59 |
+
try:
|
| 60 |
+
return python_env.get_task(task_id)
|
| 61 |
+
except ValueError as exc:
|
| 62 |
+
raise HTTPException(status_code=404, detail=str(exc)) from exc
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@router.post("/tasks/{task_id}/grade", response_model=TaskGrade)
|
| 66 |
+
def grade_task(task_id: str, payload: PythonCodeReviewAction) -> TaskGrade:
|
| 67 |
+
"""Grade code submission for a task without running an episode."""
|
| 68 |
+
if payload.action_type != "edit_code" or not payload.code:
|
| 69 |
+
raise HTTPException(
|
| 70 |
+
status_code=400,
|
| 71 |
+
detail="Requires action_type='edit_code' with code parameter."
|
| 72 |
+
)
|
| 73 |
+
try:
|
| 74 |
+
return python_env.grade_task_submission(task_id=task_id, code=payload.code)
|
| 75 |
+
except ValueError as exc:
|
| 76 |
+
raise HTTPException(status_code=404, detail=str(exc)) from exc
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
@router.post("/state", response_model=PythonCodeReviewState)
|
| 80 |
+
def get_state_post() -> RedirectResponse:
|
| 81 |
+
"""Redirect POST /state to GET for compatibility."""
|
| 82 |
+
return RedirectResponse(url="/state", status_code=303)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
app.include_router(router)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def _prioritize_route(path: str, methods: set[str]) -> None:
|
| 89 |
+
"""Move a matching custom route ahead of default OpenEnv routes."""
|
| 90 |
+
try:
|
| 91 |
+
for index in range(len(app.router.routes) - 1, -1, -1):
|
| 92 |
+
route = app.router.routes[index]
|
| 93 |
+
route_path = getattr(route, "path", None)
|
| 94 |
+
route_methods = set(getattr(route, "methods", set()) or set())
|
| 95 |
+
if route_path == path and methods.issubset(route_methods):
|
| 96 |
+
app.router.routes.insert(0, app.router.routes.pop(index))
|
| 97 |
+
break
|
| 98 |
+
except Exception:
|
| 99 |
+
pass
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
_prioritize_route("/health", {"GET"})
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
|
| 106 |
+
"""Run the FastAPI application with uvicorn."""
|
| 107 |
+
import uvicorn
|
| 108 |
+
uvicorn.run(
|
| 109 |
+
app,
|
| 110 |
+
host=os.getenv("HOST", host),
|
| 111 |
+
port=int(os.getenv("PORT", str(port))),
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
if __name__ == "__main__":
|
| 116 |
+
main()
|
| 117 |
+
|
server/code_review_env_environment.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility shim for older imports."""
|
| 2 |
+
|
| 3 |
+
try:
|
| 4 |
+
from server.code_review_environment import CodeReviewEnvironment
|
| 5 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 6 |
+
from .code_review_environment import CodeReviewEnvironment
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
__all__ = ["CodeReviewEnvironment"]
|
server/code_review_environment.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility wrapper for older imports."""
|
| 2 |
+
|
| 3 |
+
from .env import CodeReviewEnvironment, PythonCodeReviewEnvironment, PythonEnvironment
|
| 4 |
+
|
| 5 |
+
__all__ = ["CodeReviewEnvironment", "PythonCodeReviewEnvironment", "PythonEnvironment"]
|
server/env.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .env_safe import * # noqa: F401,F403
|
server/env_safe.py
ADDED
|
@@ -0,0 +1,492 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Safe OpenEnv environment for deterministic Python code repair tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Optional
|
| 6 |
+
from uuid import uuid4
|
| 7 |
+
|
| 8 |
+
from compat import Environment
|
| 9 |
+
from graders import grade_task
|
| 10 |
+
from models import (
|
| 11 |
+
HealthResponse,
|
| 12 |
+
HistoryEntry,
|
| 13 |
+
PythonCodeReviewAction,
|
| 14 |
+
PythonCodeReviewObservation,
|
| 15 |
+
PythonCodeReviewState,
|
| 16 |
+
RewardDetails,
|
| 17 |
+
TaskGrade,
|
| 18 |
+
)
|
| 19 |
+
from tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
INVALID_ACTION_PENALTY = 0.10
|
| 23 |
+
NO_PROGRESS_PENALTY = 0.08
|
| 24 |
+
REPEATED_ACTION_PENALTY = 0.05
|
| 25 |
+
BASE_STEP_PENALTY = 0.02
|
| 26 |
+
ANALYZE_STEP_PENALTY = 0.01
|
| 27 |
+
SUBMIT_COMPLETION_BONUS = 0.30
|
| 28 |
+
TIMEOUT_PENALTY = 0.12
|
| 29 |
+
VALID_ACTIONS = {"analyze_code", "edit_code", "run_tests", "submit_solution"}
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
|
| 33 |
+
"""Clamp a scalar to a bounded numeric interval."""
|
| 34 |
+
try:
|
| 35 |
+
return max(low, min(high, float(value)))
|
| 36 |
+
except Exception:
|
| 37 |
+
return low
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _safe_text(value: Any, default: str = "") -> str:
|
| 41 |
+
"""Convert values into short stable strings."""
|
| 42 |
+
try:
|
| 43 |
+
text = str(value)
|
| 44 |
+
except Exception:
|
| 45 |
+
return default
|
| 46 |
+
text = " ".join(text.split())
|
| 47 |
+
return text[:240] if text else default
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class PythonCodeReviewEnvironment(
|
| 51 |
+
Environment[PythonCodeReviewAction, PythonCodeReviewObservation, PythonCodeReviewState]
|
| 52 |
+
):
|
| 53 |
+
"""Deterministic, bounded, evaluator-safe environment for code repair tasks."""
|
| 54 |
+
|
| 55 |
+
SUPPORTS_CONCURRENT_SESSIONS = True
|
| 56 |
+
|
| 57 |
+
def __init__(self, verbose: bool = False) -> None:
|
| 58 |
+
super().__init__()
|
| 59 |
+
self._verbose = bool(verbose)
|
| 60 |
+
self._task_order = self._safe_task_order()
|
| 61 |
+
self._task_cursor = -1
|
| 62 |
+
self._task: Optional[TaskSpec] = None
|
| 63 |
+
self._state = PythonCodeReviewState(episode_id=str(uuid4()))
|
| 64 |
+
self._done = False
|
| 65 |
+
self._last_status = "Call reset() to start."
|
| 66 |
+
self._last_reward = RewardDetails(value=0.0, reason="Environment initialized.")
|
| 67 |
+
self._metrics = self._blank_metrics()
|
| 68 |
+
self._last_action_type = ""
|
| 69 |
+
|
| 70 |
+
def reset(
|
| 71 |
+
self,
|
| 72 |
+
seed: Optional[int] = None,
|
| 73 |
+
episode_id: Optional[str] = None,
|
| 74 |
+
task_id: Optional[str] = None,
|
| 75 |
+
**_: object,
|
| 76 |
+
) -> PythonCodeReviewObservation:
|
| 77 |
+
"""Reset the environment for a deterministic task and return an observation."""
|
| 78 |
+
del seed
|
| 79 |
+
try:
|
| 80 |
+
self._reset_rubric()
|
| 81 |
+
except Exception:
|
| 82 |
+
pass
|
| 83 |
+
|
| 84 |
+
task = self._select_task(task_id)
|
| 85 |
+
self._task = task
|
| 86 |
+
self._done = False
|
| 87 |
+
self._metrics = self._blank_metrics()
|
| 88 |
+
self._last_action_type = ""
|
| 89 |
+
self._last_status = "Inspect the code, run checks, edit the code, then submit."
|
| 90 |
+
self._last_reward = RewardDetails(
|
| 91 |
+
value=0.0,
|
| 92 |
+
reason="Episode reset.",
|
| 93 |
+
prev_score=0.0,
|
| 94 |
+
curr_score=0.0,
|
| 95 |
+
)
|
| 96 |
+
self._state = PythonCodeReviewState(
|
| 97 |
+
episode_id=episode_id or str(uuid4()),
|
| 98 |
+
step_count=0,
|
| 99 |
+
task_id=task.task_id,
|
| 100 |
+
difficulty=task.difficulty,
|
| 101 |
+
task_kind=task.task_kind,
|
| 102 |
+
attempts_remaining=max(int(task.max_steps), 1),
|
| 103 |
+
current_code=task.starter_code,
|
| 104 |
+
errors="",
|
| 105 |
+
test_results="No checks run yet.",
|
| 106 |
+
history=[],
|
| 107 |
+
score=0.0,
|
| 108 |
+
done=False,
|
| 109 |
+
)
|
| 110 |
+
return self._build_observation()
|
| 111 |
+
|
| 112 |
+
def step(
|
| 113 |
+
self,
|
| 114 |
+
action: PythonCodeReviewAction,
|
| 115 |
+
timeout_s: Optional[float] = None,
|
| 116 |
+
**_: object,
|
| 117 |
+
) -> PythonCodeReviewObservation:
|
| 118 |
+
"""Execute one safe environment step and always return a valid observation."""
|
| 119 |
+
del timeout_s
|
| 120 |
+
try:
|
| 121 |
+
if self._task is None:
|
| 122 |
+
return self.reset()
|
| 123 |
+
|
| 124 |
+
if self._done:
|
| 125 |
+
self._last_status = "Episode already completed. Call reset() to continue."
|
| 126 |
+
self._last_reward = RewardDetails(
|
| 127 |
+
value=-INVALID_ACTION_PENALTY,
|
| 128 |
+
invalid_action_penalty=INVALID_ACTION_PENALTY,
|
| 129 |
+
reason="Episode already completed.",
|
| 130 |
+
prev_score=self._metrics["score"],
|
| 131 |
+
curr_score=self._metrics["score"],
|
| 132 |
+
code_changed=False,
|
| 133 |
+
)
|
| 134 |
+
return self._build_observation()
|
| 135 |
+
|
| 136 |
+
self._state.step_count += 1
|
| 137 |
+
action_type = _safe_text(getattr(action, "action_type", "analyze_code"), "analyze_code")
|
| 138 |
+
code = getattr(action, "code", None)
|
| 139 |
+
|
| 140 |
+
if action_type == "analyze_code":
|
| 141 |
+
self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=False)
|
| 142 |
+
elif action_type == "run_tests":
|
| 143 |
+
self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=False)
|
| 144 |
+
elif action_type == "edit_code":
|
| 145 |
+
self._handle_edit(code)
|
| 146 |
+
elif action_type == "submit_solution":
|
| 147 |
+
self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=True)
|
| 148 |
+
self._done = True
|
| 149 |
+
else:
|
| 150 |
+
self._apply_invalid_action(f"Unsupported action_type '{action_type}'.")
|
| 151 |
+
|
| 152 |
+
self._state.attempts_remaining = max(self._task.max_steps - self._state.step_count, 0)
|
| 153 |
+
if self._state.attempts_remaining == 0 and not self._done:
|
| 154 |
+
self._auto_submit()
|
| 155 |
+
|
| 156 |
+
self._state.done = self._done
|
| 157 |
+
return self._build_observation()
|
| 158 |
+
except Exception as exc:
|
| 159 |
+
self._apply_invalid_action(f"Step failure handled: {_safe_text(exc, 'unknown_error')}")
|
| 160 |
+
self._state.done = self._done
|
| 161 |
+
return self._build_observation()
|
| 162 |
+
|
| 163 |
+
@property
|
| 164 |
+
def state(self) -> PythonCodeReviewState:
|
| 165 |
+
"""Return a deep copy of the current environment state."""
|
| 166 |
+
try:
|
| 167 |
+
return self._state.model_copy(deep=True)
|
| 168 |
+
except Exception:
|
| 169 |
+
return PythonCodeReviewState(episode_id=str(uuid4()))
|
| 170 |
+
|
| 171 |
+
def list_task_summaries(self) -> list[object]:
|
| 172 |
+
"""Return public task summaries."""
|
| 173 |
+
try:
|
| 174 |
+
return list_task_summaries()
|
| 175 |
+
except Exception:
|
| 176 |
+
return []
|
| 177 |
+
|
| 178 |
+
def get_task(self, task_id: str) -> object:
|
| 179 |
+
"""Return a single public task descriptor."""
|
| 180 |
+
return self._select_task(task_id).to_descriptor()
|
| 181 |
+
|
| 182 |
+
def health(self) -> HealthResponse:
|
| 183 |
+
"""Return a simple health response."""
|
| 184 |
+
return HealthResponse(task_count=len(self._task_order))
|
| 185 |
+
|
| 186 |
+
def grade_task_submission(self, task_id: str, code: str) -> TaskGrade:
|
| 187 |
+
"""Grade a task submission outside an episode without raising."""
|
| 188 |
+
try:
|
| 189 |
+
task = self._select_task(task_id)
|
| 190 |
+
return self._safe_grade(task=task, candidate_code=code, include_hidden=True)
|
| 191 |
+
except Exception as exc:
|
| 192 |
+
return TaskGrade(score=0.0, details={"error": _safe_text(exc, "grading_failed")})
|
| 193 |
+
|
| 194 |
+
def run_tests(self, code: str, include_hidden: bool = False) -> tuple[float, dict[str, int], TaskGrade]:
|
| 195 |
+
"""Run deterministic grading and return score plus test summary."""
|
| 196 |
+
task = self._task or self._select_task(None)
|
| 197 |
+
grade = self._safe_grade(task=task, candidate_code=code, include_hidden=include_hidden)
|
| 198 |
+
return (
|
| 199 |
+
_clamp(grade.score),
|
| 200 |
+
{"passed": int(grade.tests_passed), "total": int(grade.tests_total)},
|
| 201 |
+
grade,
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
def apply_action(self, action: PythonCodeReviewAction) -> str:
|
| 205 |
+
"""Return the candidate code implied by the action."""
|
| 206 |
+
if getattr(action, "action_type", "") == "edit_code":
|
| 207 |
+
code = getattr(action, "code", None)
|
| 208 |
+
return str(code) if code is not None else self._state.current_code
|
| 209 |
+
return self._state.current_code
|
| 210 |
+
|
| 211 |
+
def compute_reward(
|
| 212 |
+
self,
|
| 213 |
+
action_type: str,
|
| 214 |
+
previous_metrics: dict[str, float],
|
| 215 |
+
current_metrics: dict[str, float],
|
| 216 |
+
grade: TaskGrade,
|
| 217 |
+
code_changed: bool,
|
| 218 |
+
invalid_action: bool = False,
|
| 219 |
+
) -> RewardDetails:
|
| 220 |
+
"""Compute a bounded dynamic reward with progress and efficiency shaping."""
|
| 221 |
+
prev_score = _clamp(previous_metrics.get("score", 0.0))
|
| 222 |
+
curr_score = _clamp(current_metrics.get("score", 0.0))
|
| 223 |
+
score_delta = curr_score - prev_score
|
| 224 |
+
test_delta = current_metrics.get("test_fraction", 0.0) - previous_metrics.get("test_fraction", 0.0)
|
| 225 |
+
syntax_delta = current_metrics.get("syntax_score", 0.0) - previous_metrics.get("syntax_score", 0.0)
|
| 226 |
+
quality_delta = current_metrics.get("quality_score", 0.0) - previous_metrics.get("quality_score", 0.0)
|
| 227 |
+
|
| 228 |
+
step_penalty = BASE_STEP_PENALTY + (ANALYZE_STEP_PENALTY if action_type == "analyze_code" else 0.0)
|
| 229 |
+
repeated_penalty = REPEATED_ACTION_PENALTY if action_type == self._last_action_type else 0.0
|
| 230 |
+
no_progress = (
|
| 231 |
+
score_delta <= 1e-9
|
| 232 |
+
and test_delta <= 1e-9
|
| 233 |
+
and syntax_delta <= 1e-9
|
| 234 |
+
and quality_delta <= 1e-9
|
| 235 |
+
and not code_changed
|
| 236 |
+
)
|
| 237 |
+
stagnation_penalty = NO_PROGRESS_PENALTY if no_progress and not invalid_action else 0.0
|
| 238 |
+
regression_penalty = max(-score_delta, 0.0) * 0.6 + repeated_penalty + step_penalty
|
| 239 |
+
invalid_penalty = INVALID_ACTION_PENALTY if invalid_action else 0.0
|
| 240 |
+
timeout_penalty = TIMEOUT_PENALTY if bool(grade.timed_out) else 0.0
|
| 241 |
+
|
| 242 |
+
progress_reward = max(score_delta, 0.0) * 0.7
|
| 243 |
+
syntax_reward = max(syntax_delta, 0.0) * 0.5
|
| 244 |
+
test_reward = max(test_delta, 0.0) * 1.0
|
| 245 |
+
quality_bonus = max(quality_delta, 0.0) * 0.2
|
| 246 |
+
correctness_bonus = SUBMIT_COMPLETION_BONUS if action_type == "submit_solution" and curr_score >= 0.999 else 0.0
|
| 247 |
+
|
| 248 |
+
reward_value = (
|
| 249 |
+
progress_reward
|
| 250 |
+
+ syntax_reward
|
| 251 |
+
+ test_reward
|
| 252 |
+
+ quality_bonus
|
| 253 |
+
+ correctness_bonus
|
| 254 |
+
- stagnation_penalty
|
| 255 |
+
- regression_penalty
|
| 256 |
+
- invalid_penalty
|
| 257 |
+
- timeout_penalty
|
| 258 |
+
)
|
| 259 |
+
reward_value = max(-1.0, min(1.0, round(reward_value, 6)))
|
| 260 |
+
return RewardDetails(
|
| 261 |
+
value=reward_value,
|
| 262 |
+
syntax_reward=round(syntax_reward, 6),
|
| 263 |
+
test_reward=round(test_reward, 6),
|
| 264 |
+
quality_bonus=round(quality_bonus, 6),
|
| 265 |
+
correctness_bonus=round(correctness_bonus, 6),
|
| 266 |
+
progress_delta=round(progress_reward, 6),
|
| 267 |
+
stagnation_penalty=round(stagnation_penalty, 6),
|
| 268 |
+
regression_penalty=round(regression_penalty, 6),
|
| 269 |
+
invalid_action_penalty=round(invalid_penalty, 6),
|
| 270 |
+
timeout_penalty=round(timeout_penalty, 6),
|
| 271 |
+
reason=f"{action_type} reward computed safely",
|
| 272 |
+
prev_score=round(prev_score, 6),
|
| 273 |
+
curr_score=round(curr_score, 6),
|
| 274 |
+
code_changed=bool(code_changed),
|
| 275 |
+
)
|
| 276 |
+
|
| 277 |
+
def _safe_task_order(self) -> list[str]:
|
| 278 |
+
"""Load deterministic task ids with a hard fallback."""
|
| 279 |
+
try:
|
| 280 |
+
loaded = list(task_ids())
|
| 281 |
+
if loaded:
|
| 282 |
+
return [str(task_id) for task_id in loaded]
|
| 283 |
+
except Exception:
|
| 284 |
+
pass
|
| 285 |
+
return ["syntax-fix-easy", "bug-fix-medium", "optimization-hard"]
|
| 286 |
+
|
| 287 |
+
def _blank_metrics(self) -> dict[str, float]:
|
| 288 |
+
"""Return an empty metric snapshot."""
|
| 289 |
+
return {
|
| 290 |
+
"score": 0.0,
|
| 291 |
+
"test_fraction": 0.0,
|
| 292 |
+
"syntax_score": 0.0,
|
| 293 |
+
"quality_score": 0.0,
|
| 294 |
+
}
|
| 295 |
+
|
| 296 |
+
def _select_task(self, task_id: Optional[str]) -> TaskSpec:
|
| 297 |
+
"""Select the requested task or advance deterministically."""
|
| 298 |
+
try:
|
| 299 |
+
if task_id:
|
| 300 |
+
task = load_task(task_id)
|
| 301 |
+
if task.task_id in self._task_order:
|
| 302 |
+
self._task_cursor = self._task_order.index(task.task_id)
|
| 303 |
+
return task
|
| 304 |
+
except Exception:
|
| 305 |
+
pass
|
| 306 |
+
|
| 307 |
+
try:
|
| 308 |
+
self._task_cursor = (self._task_cursor + 1) % len(self._task_order)
|
| 309 |
+
return load_task(self._task_order[self._task_cursor])
|
| 310 |
+
except Exception:
|
| 311 |
+
return load_task("syntax-fix-easy")
|
| 312 |
+
|
| 313 |
+
def _safe_grade(self, task: TaskSpec, candidate_code: str, include_hidden: bool) -> TaskGrade:
|
| 314 |
+
"""Run grading without allowing exceptions to escape."""
|
| 315 |
+
try:
|
| 316 |
+
return grade_task(candidate_code, task, include_hidden=include_hidden)
|
| 317 |
+
except Exception as exc:
|
| 318 |
+
return TaskGrade(
|
| 319 |
+
score=0.0,
|
| 320 |
+
syntax_score=0.0,
|
| 321 |
+
tests_passed=0,
|
| 322 |
+
tests_total=max(len(task.visible_tests), 1),
|
| 323 |
+
details={"compile_error": "", "error": _safe_text(exc, "grading_failed")},
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
def _metrics_from_grade(self, grade: TaskGrade) -> dict[str, float]:
|
| 327 |
+
"""Derive normalized reward metrics from a grading result."""
|
| 328 |
+
tests_total = max(int(grade.tests_total), 0)
|
| 329 |
+
tests_passed = max(int(grade.tests_passed), 0)
|
| 330 |
+
test_fraction = (tests_passed / tests_total) if tests_total else _clamp(grade.syntax_score)
|
| 331 |
+
return {
|
| 332 |
+
"score": _clamp(grade.score),
|
| 333 |
+
"test_fraction": _clamp(test_fraction),
|
| 334 |
+
"syntax_score": _clamp(grade.syntax_score),
|
| 335 |
+
"quality_score": _clamp(grade.quality_score),
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
def _format_test_results(self, grade: TaskGrade, include_hidden: bool) -> str:
|
| 339 |
+
"""Format test execution results for the observation."""
|
| 340 |
+
compile_error = _safe_text(grade.details.get("compile_error", ""), "")
|
| 341 |
+
scope = "all checks" if include_hidden else "visible checks"
|
| 342 |
+
if compile_error:
|
| 343 |
+
return f"{scope}: compile error: {compile_error}"
|
| 344 |
+
if grade.timed_out:
|
| 345 |
+
return f"{scope}: execution timed out"
|
| 346 |
+
if self._task and self._task.task_kind == "syntax_fix":
|
| 347 |
+
return "visible checks: code compiles successfully"
|
| 348 |
+
return f"{scope}: {int(grade.tests_passed)}/{int(grade.tests_total)} passing"
|
| 349 |
+
|
| 350 |
+
def _build_status(self, action_type: str, grade: TaskGrade) -> str:
    """Build a human-readable status message for the last action."""
    if action_type == "submit_solution":
        return f"Solution submitted. Final score: {_clamp(grade.score):.3f}"
    if action_type == "edit_code":
        still_broken = bool(grade.details.get("compile_error"))
        return (
            "Code updated, but syntax issues remain."
            if still_broken
            else "Code updated and evaluated."
        )
    # Remaining action types map to fixed messages; unknown ones get a
    # generic safe acknowledgement.
    fixed = {
        "run_tests": "Test run completed.",
        "analyze_code": "Analysis completed.",
    }
    return fixed.get(action_type, "Action handled safely.")
|
| 363 |
+
|
| 364 |
+
def _apply_grade_to_state(self, grade: TaskGrade, include_hidden: bool) -> None:
    """Update environment state (score/errors/test summary) from a grade."""
    self._state.score = _clamp(grade.score)
    self._state.errors = _safe_text(grade.details.get("compile_error", ""), "")
    self._state.test_results = self._format_test_results(
        grade, include_hidden=include_hidden
    )
|
| 370 |
+
|
| 371 |
+
def _handle_scored_action(self, action_type: str, candidate_code: str, include_hidden: bool) -> None:
    """Grade code, update state, and compute reward for a valid action.

    Order matters: metrics and code are snapshotted BEFORE any mutation
    so the reward can compare pre- vs post-action scores.
    """
    task = self._task or self._select_task(None)
    # Snapshot prior metrics/code for delta-based reward computation.
    previous_metrics = dict(self._metrics)
    prior_code = self._state.current_code
    code_changed = candidate_code.strip() != prior_code.strip()
    # Only edit_code replaces the working code; other actions re-grade it.
    if action_type == "edit_code":
        self._state.current_code = candidate_code
    grade = self._safe_grade(task=task, candidate_code=self._state.current_code, include_hidden=include_hidden)
    current_metrics = self._metrics_from_grade(grade)
    self._apply_grade_to_state(grade, include_hidden=include_hidden)
    self._last_reward = self.compute_reward(
        action_type=action_type,
        previous_metrics=previous_metrics,
        current_metrics=current_metrics,
        grade=grade,
        code_changed=code_changed,
        invalid_action=False,
    )
    self._last_status = self._build_status(action_type, grade)
    # Commit the new metrics only after the reward was computed.
    self._metrics = current_metrics
    self._last_action_type = action_type
    self._append_history(action_type, self._last_status, self._last_reward.value)
|
| 394 |
+
|
| 395 |
+
def _handle_edit(self, code: Optional[str]) -> None:
    """Validate edit input, then evaluate the new candidate code."""
    candidate = (code or "").strip()
    if candidate:
        self._handle_scored_action(
            action_type="edit_code",
            candidate_code=candidate,
            include_hidden=False,
        )
    else:
        self._apply_invalid_action("edit_code requires code parameter.")
|
| 402 |
+
|
| 403 |
+
def _apply_invalid_action(self, reason: str) -> None:
    """Record an invalid action without crashing the episode."""
    # Metrics are unchanged by an invalid action; reuse them on both sides
    # of the reward computation so the delta is zero.
    snapshot = dict(self._metrics)
    placeholder = TaskGrade(
        score=snapshot["score"],
        syntax_score=snapshot["syntax_score"],
    )
    self._last_reward = self.compute_reward(
        action_type="invalid",
        previous_metrics=snapshot,
        current_metrics=snapshot,
        grade=placeholder,
        code_changed=False,
        invalid_action=True,
    )
    self._last_status = reason
    # History only accepts known action types; log under analyze_code.
    self._append_history("analyze_code", reason, self._last_reward.value)
|
| 417 |
+
|
| 418 |
+
def _auto_submit(self) -> None:
    """Finalize the episode (hidden checks included) when attempts run out."""
    active_task = self._task or self._select_task(None)
    final_grade = self._safe_grade(
        task=active_task,
        candidate_code=self._state.current_code,
        include_hidden=True,
    )
    self._apply_grade_to_state(final_grade, include_hidden=True)
    self._done = True
    self._state.done = True
    self._last_status = f"Auto-submitted. Final score: {_clamp(final_grade.score):.3f}"
|
| 426 |
+
|
| 427 |
+
def _append_history(self, action_type: str, status: str, reward: float) -> None:
    """Append one action record to the episode history.

    Best effort: a history failure must never break the episode loop,
    so all exceptions are swallowed deliberately.
    """
    try:
        if action_type in VALID_ACTIONS:
            recorded_action = action_type
        else:
            # Unknown action types are logged under a stable fallback.
            recorded_action = "analyze_code"
        entry = HistoryEntry(
            step=max(int(self._state.step_count), 0),
            action_type=recorded_action,
            status=_safe_text(status, "handled"),
            reward=float(reward),
        )
        self._state.history.append(entry)
    except Exception:
        pass
|
| 441 |
+
|
| 442 |
+
def _build_observation(self) -> PythonCodeReviewObservation:
    """Build a valid observation from current state.

    Never raises: if constructing the full observation fails, a minimal
    fallback observation is returned with the error text in ``errors``.
    """
    task = self._task
    try:
        return PythonCodeReviewObservation(
            task_id=self._state.task_id or "",
            # Task may be None before the first reset; use empty defaults.
            title=task.title if task else "",
            difficulty=self._state.difficulty or "easy",
            task_kind=self._state.task_kind,
            task_description=task.task_description if task else "",
            current_code=self._state.current_code,
            errors=self._state.errors,
            test_results=self._state.test_results,
            visible_tests=list(task.visible_tests) if task else [],
            # Copy the history so callers cannot mutate internal state.
            history=list(self._state.history),
            attempts_remaining=max(int(self._state.attempts_remaining), 0),
            last_action_status=self._last_status,
            score=_clamp(self._state.score),
            reward_details=self._last_reward,
            reward=self._last_reward.value,
            done=bool(self._state.done),
            metadata={
                "prev_score": self._last_reward.prev_score,
                "curr_score": self._last_reward.curr_score,
            },
        )
    except Exception as exc:
        # Fallback path: return a neutral observation that still reports
        # the failure instead of raising out of the environment.
        return PythonCodeReviewObservation(
            task_id=self._state.task_id or "",
            title="",
            difficulty="easy",
            task_kind=None,
            task_description="",
            # getattr guards against a partially-initialized state object.
            current_code=getattr(self._state, "current_code", ""),
            errors=_safe_text(exc, "observation_build_failed"),
            test_results="visible checks: unavailable",
            visible_tests=[],
            history=[],
            attempts_remaining=0,
            last_action_status="Observation fallback returned safely.",
            score=0.0,
            reward_details=RewardDetails(value=0.0, reason="Observation fallback."),
            reward=0.0,
            done=bool(getattr(self._state, "done", False)),
            metadata={},
        )
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
# Backwards-compatible aliases: older code imports the environment under
# these names.
PythonEnvironment = PythonCodeReviewEnvironment
CodeReviewEnvironment = PythonCodeReviewEnvironment
|
| 492 |
+
|
server/grading.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic grading helpers for PR-review tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from typing import Iterable, List, Optional, Sequence, Set
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
from models import ReviewFinding, TaskGrade
|
| 11 |
+
from server.task_bank import RubricIssue, TaskSpec
|
| 12 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 13 |
+
from ..models import ReviewFinding, TaskGrade
|
| 14 |
+
from .task_bank import RubricIssue, TaskSpec
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Flat score penalty per finding that matches no rubric issue.
FALSE_POSITIVE_PENALTY = 0.10
# Flat score penalty per duplicate (already-fingerprinted) finding.
DUPLICATE_PENALTY = 0.05
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass(frozen=True)
class FindingMatch:
    """Result of matching one finding against the rubric."""

    # Rubric issue id the finding matched, or None for a false positive.
    issue_id: Optional[str]
    # True when the finding duplicates an earlier submission (by fingerprint).
    duplicate: bool = False
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def finding_fingerprint(finding: ReviewFinding) -> str:
    """Build a deterministic fingerprint for duplicate detection.

    The fingerprint is the sorted token set of all identifying fields,
    so re-worded but token-identical findings collapse together.
    """
    fields = [
        finding.file_path,
        str(finding.line or 0),
        finding.category,
        finding.severity,
        finding.title,
        finding.explanation,
        finding.suggested_fix,
    ]
    return "|".join(sorted(tokens(" ".join(fields))))
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def match_finding(
    finding: ReviewFinding,
    task: TaskSpec,
    matched_issue_ids: Set[str],
    seen_fingerprints: Set[str],
) -> FindingMatch:
    """Match one finding against the remaining rubric issues.

    Duplicates (by fingerprint) are reported without consuming a rubric
    issue; otherwise the first unmatched issue that fits wins.
    """
    if finding_fingerprint(finding) in seen_fingerprints:
        return FindingMatch(issue_id=None, duplicate=True)

    # Iterate in rubric order for deterministic assignment.
    for issue in task.rubric_issues:
        if issue.issue_id in matched_issue_ids:
            continue
        if finding_matches_issue(finding, issue):
            return FindingMatch(issue_id=issue.issue_id)

    # No rubric issue fits: treat as a false positive.
    return FindingMatch(issue_id=None)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def finding_matches_issue(finding: ReviewFinding, issue: RubricIssue) -> bool:
    """Return True when a finding deterministically matches a rubric issue.

    Requires identical file/category/severity, a line within +/-2 of the
    rubric line, and at least ``min_keyword_hits`` keyword tokens present
    in the finding's free text.
    """
    same_classification = (
        finding.file_path == issue.file_path
        and finding.category == issue.category
        and finding.severity == issue.severity
    )
    if not same_classification:
        return False
    if finding.line is None or abs(finding.line - issue.line) > 2:
        return False

    described = tokens(
        " ".join([finding.title, finding.explanation, finding.suggested_fix])
    )
    hits = sum(1 for keyword in issue.keywords if keyword in described)
    return hits >= issue.min_keyword_hits
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def score_task(
    task: TaskSpec,
    matched_issue_ids: Iterable[str],
    false_positives: int = 0,
    duplicate_findings: int = 0,
) -> TaskGrade:
    """Score a task from cumulative episode state.

    The score is the summed weight of matched rubric issues minus flat
    penalties for false positives and duplicates, clamped to [0, 1].
    """
    matched = set(matched_issue_ids)
    matched_weight = sum(
        issue.weight for issue in task.rubric_issues if issue.issue_id in matched
    )
    # Subtract penalties one at a time to keep float rounding stable.
    raw = matched_weight
    raw -= false_positives * FALSE_POSITIVE_PENALTY
    raw -= duplicate_findings * DUPLICATE_PENALTY
    final_score = max(0.0, min(1.0, round(raw, 6)))
    return TaskGrade(
        score=final_score,
        matched_issue_ids=sorted(matched),
        false_positives=false_positives,
        duplicate_findings=duplicate_findings,
        matched_weight=min(1.0, round(matched_weight, 6)),
    )
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def grade_findings(task: TaskSpec, findings: Sequence[ReviewFinding]) -> TaskGrade:
    """Offline-grade a batch of findings for one task.

    Replays the findings through the same matching logic the environment
    uses incrementally, then scores the accumulated result.
    """
    matched: Set[str] = set()
    fingerprints: Set[str] = set()
    false_positive_count = 0
    duplicate_count = 0

    for finding in findings:
        outcome = match_finding(
            finding=finding,
            task=task,
            matched_issue_ids=matched,
            seen_fingerprints=fingerprints,
        )
        if outcome.duplicate:
            duplicate_count += 1
            continue
        # First sighting of this fingerprint: remember it.
        fingerprints.add(finding_fingerprint(finding))
        if outcome.issue_id is None:
            false_positive_count += 1
        else:
            matched.add(outcome.issue_id)

    return score_task(
        task=task,
        matched_issue_ids=matched,
        false_positives=false_positive_count,
        duplicate_findings=duplicate_count,
    )
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def tokens(text: str) -> Set[str]:
    """Normalize free text into deterministic comparison tokens.

    Tokens are maximal runs of lowercase letters, digits, and
    underscores; everything else is treated as a separator.
    """
    lowered = text.lower()
    return {match.group(0) for match in re.finditer(r"[a-z0-9_]+", lowered)}
|
| 147 |
+
|
server/python_env_environment.py
CHANGED
|
@@ -1,421 +1,9 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
#
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
from dataclasses import dataclass
|
| 12 |
-
from datetime import UTC, datetime
|
| 13 |
-
from typing import Dict, Iterable, List, Optional
|
| 14 |
-
from uuid import uuid4
|
| 15 |
-
|
| 16 |
-
from openenv.core.env_server.interfaces import Environment
|
| 17 |
-
from openenv.core.env_server.types import State
|
| 18 |
-
|
| 19 |
-
try:
|
| 20 |
-
from ..models import (
|
| 21 |
-
PythonAction,
|
| 22 |
-
PythonEnvConfig,
|
| 23 |
-
PythonObservation,
|
| 24 |
-
ReviewFinding,
|
| 25 |
-
TaskDescriptor,
|
| 26 |
-
TaskEvaluation,
|
| 27 |
-
)
|
| 28 |
-
except ImportError:
|
| 29 |
-
from models import ( # type: ignore
|
| 30 |
-
PythonAction,
|
| 31 |
-
PythonEnvConfig,
|
| 32 |
-
PythonObservation,
|
| 33 |
-
ReviewFinding,
|
| 34 |
-
TaskDescriptor,
|
| 35 |
-
TaskEvaluation,
|
| 36 |
-
)
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
@dataclass(frozen=True)
|
| 40 |
-
class ReferenceFinding:
|
| 41 |
-
"""Hidden finding metadata used for deterministic grading."""
|
| 42 |
-
|
| 43 |
-
rule_id: str
|
| 44 |
-
title: str
|
| 45 |
-
line: int
|
| 46 |
-
category: str
|
| 47 |
-
severity: str
|
| 48 |
-
rationale: str
|
| 49 |
-
recommendation: str
|
| 50 |
-
weight: float
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
@dataclass(frozen=True)
|
| 54 |
-
class ReviewTask:
|
| 55 |
-
"""A visible task plus its hidden grading references."""
|
| 56 |
-
|
| 57 |
-
descriptor: TaskDescriptor
|
| 58 |
-
references: tuple[ReferenceFinding, ...]
|
| 59 |
-
hint: str
|
| 60 |
-
patched_code: Optional[str] = None
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
TASK_BANK: Dict[str, ReviewTask] = {
|
| 64 |
-
"py-review-easy": ReviewTask(
|
| 65 |
-
descriptor=TaskDescriptor(
|
| 66 |
-
task_id="py-review-easy",
|
| 67 |
-
difficulty="easy",
|
| 68 |
-
title="Mutable default argument",
|
| 69 |
-
objective="Find the correctness issue and explain a safe fix.",
|
| 70 |
-
code=(
|
| 71 |
-
"def add_tag(tag, tags=[]):\n"
|
| 72 |
-
" tags.append(tag)\n"
|
| 73 |
-
" return tags\n"
|
| 74 |
-
),
|
| 75 |
-
max_steps=4,
|
| 76 |
-
success_threshold=0.7,
|
| 77 |
-
),
|
| 78 |
-
references=(
|
| 79 |
-
ReferenceFinding(
|
| 80 |
-
rule_id="mutable-default",
|
| 81 |
-
title="Mutable default list is shared across calls",
|
| 82 |
-
line=1,
|
| 83 |
-
category="bug",
|
| 84 |
-
severity="warning",
|
| 85 |
-
rationale="The list persists between calls and leaks state.",
|
| 86 |
-
recommendation="Use None as the default and create a new list inside the function.",
|
| 87 |
-
weight=1.0,
|
| 88 |
-
),
|
| 89 |
-
),
|
| 90 |
-
hint="Look for state that survives between separate function calls.",
|
| 91 |
-
patched_code=(
|
| 92 |
-
"def add_tag(tag, tags=None):\n"
|
| 93 |
-
" if tags is None:\n"
|
| 94 |
-
" tags = []\n"
|
| 95 |
-
" tags.append(tag)\n"
|
| 96 |
-
" return tags\n"
|
| 97 |
-
),
|
| 98 |
-
),
|
| 99 |
-
"py-review-medium": ReviewTask(
|
| 100 |
-
descriptor=TaskDescriptor(
|
| 101 |
-
task_id="py-review-medium",
|
| 102 |
-
difficulty="medium",
|
| 103 |
-
title="Unsafe shell invocation",
|
| 104 |
-
objective="Review the snippet for security-sensitive behavior.",
|
| 105 |
-
code=(
|
| 106 |
-
"import os\n\n"
|
| 107 |
-
"def run_backup(path):\n"
|
| 108 |
-
" os.system(f\"tar -czf backup.tgz {path}\")\n"
|
| 109 |
-
),
|
| 110 |
-
max_steps=4,
|
| 111 |
-
success_threshold=0.72,
|
| 112 |
-
),
|
| 113 |
-
references=(
|
| 114 |
-
ReferenceFinding(
|
| 115 |
-
rule_id="shell-injection",
|
| 116 |
-
title="User input is interpolated into a shell command",
|
| 117 |
-
line=4,
|
| 118 |
-
category="security",
|
| 119 |
-
severity="critical",
|
| 120 |
-
rationale="An attacker can inject shell metacharacters through the path argument.",
|
| 121 |
-
recommendation="Use subprocess with an argument list instead of os.system.",
|
| 122 |
-
weight=1.0,
|
| 123 |
-
),
|
| 124 |
-
),
|
| 125 |
-
hint="Check how external commands are invoked and whether user input is escaped.",
|
| 126 |
-
patched_code=(
|
| 127 |
-
"import subprocess\n\n"
|
| 128 |
-
"def run_backup(path):\n"
|
| 129 |
-
" subprocess.run([\"tar\", \"-czf\", \"backup.tgz\", path], check=True)\n"
|
| 130 |
-
),
|
| 131 |
-
),
|
| 132 |
-
"py-review-hard": ReviewTask(
|
| 133 |
-
descriptor=TaskDescriptor(
|
| 134 |
-
task_id="py-review-hard",
|
| 135 |
-
difficulty="hard",
|
| 136 |
-
title="Retry helper hides failures",
|
| 137 |
-
objective="Identify correctness and maintainability issues in the retry logic.",
|
| 138 |
-
code=(
|
| 139 |
-
"import time\n\n"
|
| 140 |
-
"def fetch_with_retry(client, url, retries=3):\n"
|
| 141 |
-
" last_error = None\n"
|
| 142 |
-
" for _ in range(retries):\n"
|
| 143 |
-
" try:\n"
|
| 144 |
-
" return client.get(url, timeout=1)\n"
|
| 145 |
-
" except Exception as exc:\n"
|
| 146 |
-
" last_error = exc\n"
|
| 147 |
-
" time.sleep(0.1)\n"
|
| 148 |
-
" return None\n"
|
| 149 |
-
),
|
| 150 |
-
max_steps=4,
|
| 151 |
-
success_threshold=0.74,
|
| 152 |
-
),
|
| 153 |
-
references=(
|
| 154 |
-
ReferenceFinding(
|
| 155 |
-
rule_id="swallowed-error",
|
| 156 |
-
title="Function swallows the final exception and returns None",
|
| 157 |
-
line=10,
|
| 158 |
-
category="bug",
|
| 159 |
-
severity="warning",
|
| 160 |
-
rationale="Callers cannot distinguish a failed request from a valid None result.",
|
| 161 |
-
recommendation="Re-raise the last exception after retries are exhausted.",
|
| 162 |
-
weight=0.65,
|
| 163 |
-
),
|
| 164 |
-
ReferenceFinding(
|
| 165 |
-
rule_id="broad-except",
|
| 166 |
-
title="Broad exception handler catches unexpected failures",
|
| 167 |
-
line=7,
|
| 168 |
-
category="maintainability",
|
| 169 |
-
severity="info",
|
| 170 |
-
rationale="Catching Exception masks programming errors and interrupts.",
|
| 171 |
-
recommendation="Catch only the client or network exceptions you expect to retry.",
|
| 172 |
-
weight=0.35,
|
| 173 |
-
),
|
| 174 |
-
),
|
| 175 |
-
hint="Consider what happens to the final error after the retry loop finishes.",
|
| 176 |
-
patched_code=(
|
| 177 |
-
"import time\n\n"
|
| 178 |
-
"def fetch_with_retry(client, url, retries=3):\n"
|
| 179 |
-
" last_error = None\n"
|
| 180 |
-
" for _ in range(retries):\n"
|
| 181 |
-
" try:\n"
|
| 182 |
-
" return client.get(url, timeout=1)\n"
|
| 183 |
-
" except client.retryable_exceptions as exc:\n"
|
| 184 |
-
" last_error = exc\n"
|
| 185 |
-
" time.sleep(0.1)\n"
|
| 186 |
-
" if last_error is not None:\n"
|
| 187 |
-
" raise last_error\n"
|
| 188 |
-
),
|
| 189 |
-
),
|
| 190 |
-
}
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
def _utc_now() -> str:
|
| 194 |
-
return datetime.now(UTC).isoformat()
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
def _normalize_text(value: Optional[str]) -> str:
|
| 198 |
-
return " ".join((value or "").strip().lower().split())
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
def _normalize_code(value: Optional[str]) -> str:
|
| 202 |
-
return "\n".join(line.rstrip() for line in (value or "").strip().splitlines())
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
|
| 206 |
-
"""Deterministic benchmark environment for Python code review tasks."""
|
| 207 |
-
|
| 208 |
-
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
| 209 |
-
|
| 210 |
-
def __init__(self, config: Optional[PythonEnvConfig] = None):
|
| 211 |
-
super().__init__()
|
| 212 |
-
self._config = config or PythonEnvConfig()
|
| 213 |
-
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 214 |
-
self._task_cursor = -1
|
| 215 |
-
self._current_task: Optional[ReviewTask] = None
|
| 216 |
-
self._submitted_findings: List[ReviewFinding] = []
|
| 217 |
-
self._hints_used = 0
|
| 218 |
-
self._created_at = _utc_now()
|
| 219 |
-
|
| 220 |
-
def reset(
|
| 221 |
-
self,
|
| 222 |
-
seed: Optional[int] = None,
|
| 223 |
-
episode_id: Optional[str] = None,
|
| 224 |
-
**kwargs,
|
| 225 |
-
) -> PythonObservation:
|
| 226 |
-
"""Start the next configured review task."""
|
| 227 |
-
|
| 228 |
-
del seed, kwargs
|
| 229 |
-
self._task_cursor = (self._task_cursor + 1) % len(self._config.task_order)
|
| 230 |
-
task_id = self._config.task_order[self._task_cursor]
|
| 231 |
-
self._current_task = TASK_BANK.get(task_id, TASK_BANK["py-review-easy"])
|
| 232 |
-
self._state = State(
|
| 233 |
-
episode_id=episode_id or str(uuid4()),
|
| 234 |
-
step_count=0,
|
| 235 |
-
)
|
| 236 |
-
self._submitted_findings = []
|
| 237 |
-
self._hints_used = 0
|
| 238 |
-
self._created_at = _utc_now()
|
| 239 |
-
return self._build_observation(
|
| 240 |
-
feedback="New review task loaded. Submit findings or request a hint.",
|
| 241 |
-
reward=0.0,
|
| 242 |
-
done=False,
|
| 243 |
-
)
|
| 244 |
-
|
| 245 |
-
def step(
|
| 246 |
-
self,
|
| 247 |
-
action: PythonAction,
|
| 248 |
-
timeout_s: Optional[float] = None,
|
| 249 |
-
**kwargs,
|
| 250 |
-
) -> PythonObservation:
|
| 251 |
-
"""Process one review action and return updated feedback."""
|
| 252 |
-
|
| 253 |
-
del timeout_s, kwargs
|
| 254 |
-
if self._current_task is None:
|
| 255 |
-
return self.reset()
|
| 256 |
-
|
| 257 |
-
self._state.step_count += 1
|
| 258 |
-
operation = action.operation
|
| 259 |
-
feedback = ""
|
| 260 |
-
reward = 0.0
|
| 261 |
-
done = False
|
| 262 |
-
|
| 263 |
-
if operation == "request_hint":
|
| 264 |
-
self._hints_used += 1
|
| 265 |
-
feedback = self._current_task.hint
|
| 266 |
-
evaluation = self._evaluate(self._submitted_findings, action.patched_code)
|
| 267 |
-
reward = evaluation.score
|
| 268 |
-
else:
|
| 269 |
-
if action.findings:
|
| 270 |
-
self._submitted_findings.extend(action.findings)
|
| 271 |
-
evaluation = self._evaluate(self._submitted_findings, action.patched_code)
|
| 272 |
-
reward = evaluation.score
|
| 273 |
-
if operation == "finalize":
|
| 274 |
-
done = True
|
| 275 |
-
feedback = (
|
| 276 |
-
"Review finalized. "
|
| 277 |
-
f"Matched {evaluation.matched_findings}/{evaluation.total_findings} "
|
| 278 |
-
"reference findings."
|
| 279 |
-
)
|
| 280 |
-
else:
|
| 281 |
-
feedback = (
|
| 282 |
-
f"Progress saved. Matched {evaluation.matched_findings}/"
|
| 283 |
-
f"{evaluation.total_findings} findings with score {evaluation.score:.2f}."
|
| 284 |
-
)
|
| 285 |
-
|
| 286 |
-
if self._state.step_count >= self._max_steps():
|
| 287 |
-
done = True
|
| 288 |
-
if operation != "finalize":
|
| 289 |
-
feedback = (
|
| 290 |
-
f"{feedback} Maximum steps reached."
|
| 291 |
-
if feedback
|
| 292 |
-
else "Maximum steps reached."
|
| 293 |
-
)
|
| 294 |
-
|
| 295 |
-
return self._build_observation(
|
| 296 |
-
feedback=feedback,
|
| 297 |
-
reward=reward,
|
| 298 |
-
done=done,
|
| 299 |
-
patched_code=action.patched_code,
|
| 300 |
-
)
|
| 301 |
-
|
| 302 |
-
def _build_observation(
|
| 303 |
-
self,
|
| 304 |
-
*,
|
| 305 |
-
feedback: str,
|
| 306 |
-
reward: float,
|
| 307 |
-
done: bool,
|
| 308 |
-
patched_code: Optional[str] = None,
|
| 309 |
-
) -> PythonObservation:
|
| 310 |
-
assert self._current_task is not None
|
| 311 |
-
evaluation = self._evaluate(self._submitted_findings, patched_code)
|
| 312 |
-
attempts_remaining = max(
|
| 313 |
-
self._max_steps() - self._state.step_count,
|
| 314 |
-
0,
|
| 315 |
-
)
|
| 316 |
-
return PythonObservation(
|
| 317 |
-
task=self._current_task.descriptor,
|
| 318 |
-
feedback=feedback,
|
| 319 |
-
submitted_findings=list(self._submitted_findings),
|
| 320 |
-
hints_used=self._hints_used,
|
| 321 |
-
attempts_remaining=attempts_remaining,
|
| 322 |
-
evaluation=evaluation,
|
| 323 |
-
score=evaluation.score,
|
| 324 |
-
review_time_ms=float(self._state.step_count * 125),
|
| 325 |
-
done=done,
|
| 326 |
-
reward=reward,
|
| 327 |
-
metadata={
|
| 328 |
-
"episode_id": self._state.episode_id,
|
| 329 |
-
"created_at": self._created_at,
|
| 330 |
-
"updated_at": _utc_now(),
|
| 331 |
-
},
|
| 332 |
-
)
|
| 333 |
-
|
| 334 |
-
def _evaluate(
|
| 335 |
-
self,
|
| 336 |
-
findings: Iterable[ReviewFinding],
|
| 337 |
-
patched_code: Optional[str],
|
| 338 |
-
) -> TaskEvaluation:
|
| 339 |
-
assert self._current_task is not None
|
| 340 |
-
|
| 341 |
-
references = self._current_task.references
|
| 342 |
-
matched_reference_ids: List[str] = []
|
| 343 |
-
matched_weight = 0.0
|
| 344 |
-
false_positives = 0
|
| 345 |
-
duplicate_findings = 0
|
| 346 |
-
|
| 347 |
-
seen_ids = set()
|
| 348 |
-
for finding in findings:
|
| 349 |
-
ref_id = self._match_reference(finding, references)
|
| 350 |
-
if ref_id is None:
|
| 351 |
-
false_positives += 1
|
| 352 |
-
continue
|
| 353 |
-
if ref_id in seen_ids:
|
| 354 |
-
duplicate_findings += 1
|
| 355 |
-
continue
|
| 356 |
-
seen_ids.add(ref_id)
|
| 357 |
-
matched_reference_ids.append(ref_id)
|
| 358 |
-
matched_weight += next(ref.weight for ref in references if ref.rule_id == ref_id)
|
| 359 |
-
|
| 360 |
-
total_weight = sum(ref.weight for ref in references) or 1.0
|
| 361 |
-
weighted_recall = min(matched_weight / total_weight, 1.0)
|
| 362 |
-
|
| 363 |
-
patch_score = 0.0
|
| 364 |
-
if self._current_task.patched_code and patched_code:
|
| 365 |
-
patch_score = float(
|
| 366 |
-
_normalize_code(patched_code) == _normalize_code(self._current_task.patched_code)
|
| 367 |
-
)
|
| 368 |
-
|
| 369 |
-
raw_score = (
|
| 370 |
-
weighted_recall
|
| 371 |
-
+ (self._config.patch_bonus_multiplier * patch_score)
|
| 372 |
-
- (self._config.false_positive_penalty * false_positives)
|
| 373 |
-
- (self._config.duplicate_penalty * duplicate_findings)
|
| 374 |
-
- (self._config.hint_penalty * self._hints_used)
|
| 375 |
-
)
|
| 376 |
-
score = max(0.0, min(raw_score, 1.0))
|
| 377 |
-
|
| 378 |
-
return TaskEvaluation(
|
| 379 |
-
matched_reference_ids=matched_reference_ids,
|
| 380 |
-
matched_findings=len(matched_reference_ids),
|
| 381 |
-
total_findings=len(references),
|
| 382 |
-
false_positives=false_positives,
|
| 383 |
-
duplicate_findings=duplicate_findings,
|
| 384 |
-
weighted_recall=weighted_recall,
|
| 385 |
-
patch_score=patch_score,
|
| 386 |
-
score=score,
|
| 387 |
-
passed=score >= self._current_task.descriptor.success_threshold,
|
| 388 |
-
)
|
| 389 |
-
|
| 390 |
-
def _match_reference(
|
| 391 |
-
self,
|
| 392 |
-
finding: ReviewFinding,
|
| 393 |
-
references: Iterable[ReferenceFinding],
|
| 394 |
-
) -> Optional[str]:
|
| 395 |
-
finding_rule = _normalize_text(finding.rule_id)
|
| 396 |
-
finding_title = _normalize_text(finding.title)
|
| 397 |
-
for reference in references:
|
| 398 |
-
if finding_rule and finding_rule == _normalize_text(reference.rule_id):
|
| 399 |
-
return reference.rule_id
|
| 400 |
-
line_matches = finding.line is not None and finding.line == reference.line
|
| 401 |
-
category_matches = finding.category == reference.category
|
| 402 |
-
title_matches = finding_title and (
|
| 403 |
-
finding_title in _normalize_text(reference.title)
|
| 404 |
-
or _normalize_text(reference.title) in finding_title
|
| 405 |
-
)
|
| 406 |
-
if line_matches and (category_matches or title_matches):
|
| 407 |
-
return reference.rule_id
|
| 408 |
-
return None
|
| 409 |
-
|
| 410 |
-
def _max_steps(self) -> int:
|
| 411 |
-
assert self._current_task is not None
|
| 412 |
-
return min(
|
| 413 |
-
self._current_task.descriptor.max_steps,
|
| 414 |
-
self._config.max_steps_per_task,
|
| 415 |
-
)
|
| 416 |
-
|
| 417 |
-
@property
|
| 418 |
-
def state(self) -> State:
|
| 419 |
-
"""Return the current environment state."""
|
| 420 |
-
|
| 421 |
-
return self._state
|
|
|
|
| 1 |
+
"""Compatibility shim for older imports."""
|
| 2 |
+
|
| 3 |
+
try:
|
| 4 |
+
from server.code_review_environment import PythonEnvironment
|
| 5 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 6 |
+
from .code_review_environment import PythonEnvironment
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
__all__ = ["PythonEnvironment"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/requirements.txt
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
openenv[core]>=0.2.
|
| 2 |
-
fastapi>=0.115.0
|
| 3 |
-
uvicorn>=0.
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.2
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn[standard]>=0.30.0
|
| 4 |
+
openai>=1.40.0
|
| 5 |
+
pytest>=8.0.0
|
| 6 |
+
pydantic>=2.0.0
|
server/static_review.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic static-review helpers for arbitrary Python code.
|
| 2 |
+
|
| 3 |
+
Unlike the benchmark grader, this module does not compare against hidden rubric
|
| 4 |
+
items. Instead, it performs direct AST-based review on arbitrary snippets so it
|
| 5 |
+
can be used for manual testing, examples, and future dataset generation.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import ast
|
| 11 |
+
from typing import List, Optional
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
from models import DirectReviewResponse, ReviewFinding
|
| 15 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 16 |
+
from ..models import DirectReviewResponse, ReviewFinding
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class _StaticAnalyzer(ast.NodeVisitor):
|
| 20 |
+
"""AST visitor that emits structured review findings.
|
| 21 |
+
|
| 22 |
+
The visitor intentionally focuses on a small set of high-signal patterns so
|
| 23 |
+
the direct-review endpoint stays predictable and easy to understand.
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
def __init__(self) -> None:
|
| 27 |
+
self.issues: List[ReviewFinding] = []
|
| 28 |
+
|
| 29 |
+
def visit_FunctionDef(self, node: ast.FunctionDef) -> None: # noqa: N802
|
| 30 |
+
"""Flag mutable default arguments in function definitions."""
|
| 31 |
+
|
| 32 |
+
for default in list(node.args.defaults):
|
| 33 |
+
if isinstance(default, (ast.List, ast.Dict, ast.Set)):
|
| 34 |
+
self.issues.append(
|
| 35 |
+
ReviewFinding(
|
| 36 |
+
title="Mutable default argument",
|
| 37 |
+
line=getattr(default, "lineno", node.lineno),
|
| 38 |
+
category="bug",
|
| 39 |
+
severity="warning",
|
| 40 |
+
rationale=(
|
| 41 |
+
"Mutable defaults persist across calls and can leak state "
|
| 42 |
+
"between unrelated requests."
|
| 43 |
+
),
|
| 44 |
+
recommendation="Use None as the default and create the object inside the function.",
|
| 45 |
+
rule_id="mutable-default-list",
|
| 46 |
+
)
|
| 47 |
+
)
|
| 48 |
+
self.generic_visit(node)
|
| 49 |
+
|
| 50 |
+
def visit_Call(self, node: ast.Call) -> None: # noqa: N802
|
| 51 |
+
"""Inspect function calls for obviously unsafe or noisy patterns."""
|
| 52 |
+
|
| 53 |
+
func_name = self._call_name(node)
|
| 54 |
+
if func_name in {"eval", "exec"}:
|
| 55 |
+
self.issues.append(
|
| 56 |
+
ReviewFinding(
|
| 57 |
+
title=f"Avoid {func_name} on untrusted input",
|
| 58 |
+
line=node.lineno,
|
| 59 |
+
category="security",
|
| 60 |
+
severity="critical",
|
| 61 |
+
rationale=(
|
| 62 |
+
f"{func_name} executes arbitrary code and is unsafe on "
|
| 63 |
+
"user-controlled input."
|
| 64 |
+
),
|
| 65 |
+
recommendation="Use a safe parser or a whitelist-based evaluator.",
|
| 66 |
+
rule_id="avoid-eval" if func_name == "eval" else "avoid-exec",
|
| 67 |
+
)
|
| 68 |
+
)
|
| 69 |
+
if func_name.endswith("check_output") or func_name.endswith("run"):
|
| 70 |
+
for keyword in node.keywords:
|
| 71 |
+
# `shell=True` is only a problem when the command comes from a
|
| 72 |
+
# shell-parsed string, but this heuristic is high value for
|
| 73 |
+
# review and intentionally conservative.
|
| 74 |
+
if keyword.arg == "shell" and isinstance(keyword.value, ast.Constant) and keyword.value.value is True:
|
| 75 |
+
self.issues.append(
|
| 76 |
+
ReviewFinding(
|
| 77 |
+
title="shell=True with dynamic input",
|
| 78 |
+
line=node.lineno,
|
| 79 |
+
category="security",
|
| 80 |
+
severity="critical",
|
| 81 |
+
rationale=(
|
| 82 |
+
"shell=True executes through the shell and can allow "
|
| 83 |
+
"command injection when the command string is interpolated."
|
| 84 |
+
),
|
| 85 |
+
recommendation="Pass a list of arguments and keep shell=False.",
|
| 86 |
+
rule_id="shell-true-command-injection",
|
| 87 |
+
)
|
| 88 |
+
)
|
| 89 |
+
if func_name == "print":
|
| 90 |
+
self.issues.append(
|
| 91 |
+
ReviewFinding(
|
| 92 |
+
title="Print statement in application logic",
|
| 93 |
+
line=node.lineno,
|
| 94 |
+
category="style",
|
| 95 |
+
severity="info",
|
| 96 |
+
rationale="Production services should prefer structured logging over print statements.",
|
| 97 |
+
recommendation="Use the logging module or return the value to the caller.",
|
| 98 |
+
rule_id="print-statement",
|
| 99 |
+
)
|
| 100 |
+
)
|
| 101 |
+
self.generic_visit(node)
|
| 102 |
+
|
| 103 |
+
def visit_ExceptHandler(self, node: ast.ExceptHandler) -> None: # noqa: N802
|
| 104 |
+
"""Flag bare exception handlers that hide failures."""
|
| 105 |
+
|
| 106 |
+
if node.type is None:
|
| 107 |
+
self.issues.append(
|
| 108 |
+
ReviewFinding(
|
| 109 |
+
title="Bare except",
|
| 110 |
+
line=node.lineno,
|
| 111 |
+
category="maintainability",
|
| 112 |
+
severity="warning",
|
| 113 |
+
rationale="Bare except catches KeyboardInterrupt and other system-level exceptions.",
|
| 114 |
+
recommendation="Catch a specific exception and record the failure.",
|
| 115 |
+
rule_id="bare-except",
|
| 116 |
+
)
|
| 117 |
+
)
|
| 118 |
+
self.generic_visit(node)
|
| 119 |
+
|
| 120 |
+
def visit_For(self, node: ast.For) -> None: # noqa: N802
|
| 121 |
+
"""Look for list-membership checks nested in loops."""
|
| 122 |
+
|
| 123 |
+
for child in ast.walk(node):
|
| 124 |
+
if isinstance(child, ast.Compare) and any(
|
| 125 |
+
isinstance(operator, (ast.In, ast.NotIn)) for operator in child.ops
|
| 126 |
+
):
|
| 127 |
+
if isinstance(child.comparators[0], ast.Name):
|
| 128 |
+
self.issues.append(
|
| 129 |
+
ReviewFinding(
|
| 130 |
+
title="Potential quadratic membership check inside loop",
|
| 131 |
+
line=child.lineno,
|
| 132 |
+
category="performance",
|
| 133 |
+
severity="warning",
|
| 134 |
+
rationale=(
|
| 135 |
+
"Repeated membership checks against a list inside a loop "
|
| 136 |
+
"can degrade to quadratic runtime."
|
| 137 |
+
),
|
| 138 |
+
recommendation="Use a set or dict for O(1) membership checks.",
|
| 139 |
+
rule_id="quadratic-membership-check",
|
| 140 |
+
)
|
| 141 |
+
)
|
| 142 |
+
break
|
| 143 |
+
self.generic_visit(node)
|
| 144 |
+
|
| 145 |
+
@staticmethod
|
| 146 |
+
def _call_name(node: ast.Call) -> str:
|
| 147 |
+
"""Extract a dotted function name such as `subprocess.run`."""
|
| 148 |
+
|
| 149 |
+
func = node.func
|
| 150 |
+
if isinstance(func, ast.Name):
|
| 151 |
+
return func.id
|
| 152 |
+
if isinstance(func, ast.Attribute):
|
| 153 |
+
prefix = _StaticAnalyzer._attribute_prefix(func.value)
|
| 154 |
+
return f"{prefix}.{func.attr}" if prefix else func.attr
|
| 155 |
+
return ""
|
| 156 |
+
|
| 157 |
+
@staticmethod
|
| 158 |
+
def _attribute_prefix(node: ast.AST) -> str:
|
| 159 |
+
"""Reconstruct the left-hand side of an attribute chain."""
|
| 160 |
+
|
| 161 |
+
if isinstance(node, ast.Name):
|
| 162 |
+
return node.id
|
| 163 |
+
if isinstance(node, ast.Attribute):
|
| 164 |
+
prefix = _StaticAnalyzer._attribute_prefix(node.value)
|
| 165 |
+
return f"{prefix}.{node.attr}" if prefix else node.attr
|
| 166 |
+
return ""
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def analyze_python_code(code: str) -> List[ReviewFinding]:
|
| 170 |
+
"""Analyze arbitrary Python code and return structured findings."""
|
| 171 |
+
|
| 172 |
+
if not code.strip():
|
| 173 |
+
return [
|
| 174 |
+
ReviewFinding(
|
| 175 |
+
title="No code provided",
|
| 176 |
+
category="bug",
|
| 177 |
+
severity="warning",
|
| 178 |
+
rationale="The reviewer cannot inspect an empty submission.",
|
| 179 |
+
recommendation="Provide Python source code.",
|
| 180 |
+
rule_id="empty-input",
|
| 181 |
+
)
|
| 182 |
+
]
|
| 183 |
+
|
| 184 |
+
# Syntax errors are turned into findings rather than exceptions so API
|
| 185 |
+
# consumers always get a valid response shape.
|
| 186 |
+
try:
|
| 187 |
+
tree = ast.parse(code)
|
| 188 |
+
except SyntaxError as exc:
|
| 189 |
+
return [
|
| 190 |
+
ReviewFinding(
|
| 191 |
+
title="Syntax error",
|
| 192 |
+
line=exc.lineno,
|
| 193 |
+
category="bug",
|
| 194 |
+
severity="critical",
|
| 195 |
+
rationale=exc.msg,
|
| 196 |
+
recommendation="Fix the syntax error before running static review.",
|
| 197 |
+
rule_id="syntax-error",
|
| 198 |
+
)
|
| 199 |
+
]
|
| 200 |
+
|
| 201 |
+
analyzer = _StaticAnalyzer()
|
| 202 |
+
analyzer.visit(tree)
|
| 203 |
+
return _deduplicate(analyzer.issues)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def build_direct_review_response(
|
| 207 |
+
code: str, context: Optional[str] = None
|
| 208 |
+
) -> DirectReviewResponse:
|
| 209 |
+
"""Build the public direct-review response for the `/review` route."""
|
| 210 |
+
|
| 211 |
+
issues = analyze_python_code(code)
|
| 212 |
+
weighted_penalty = 0.0
|
| 213 |
+
# The direct-review score is intentionally simple: more severe issues lower
|
| 214 |
+
# the score more aggressively.
|
| 215 |
+
for issue in issues:
|
| 216 |
+
if issue.severity == "critical":
|
| 217 |
+
weighted_penalty += 0.3
|
| 218 |
+
elif issue.severity == "warning":
|
| 219 |
+
weighted_penalty += 0.15
|
| 220 |
+
else:
|
| 221 |
+
weighted_penalty += 0.05
|
| 222 |
+
|
| 223 |
+
score = max(0.0, min(1.0, 1.0 - weighted_penalty))
|
| 224 |
+
summary = _build_summary(issues, context)
|
| 225 |
+
improved_code = _suggest_improved_code(code, issues)
|
| 226 |
+
return DirectReviewResponse(
|
| 227 |
+
issues=issues,
|
| 228 |
+
summary=summary,
|
| 229 |
+
score=score,
|
| 230 |
+
improved_code=improved_code,
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def _build_summary(issues: List[ReviewFinding], context: Optional[str]) -> str:
|
| 235 |
+
"""Create a concise human-readable summary for the direct-review response."""
|
| 236 |
+
|
| 237 |
+
if not issues:
|
| 238 |
+
base = "No obvious issues were detected by the deterministic reviewer."
|
| 239 |
+
else:
|
| 240 |
+
critical = sum(1 for issue in issues if issue.severity == "critical")
|
| 241 |
+
warnings = sum(1 for issue in issues if issue.severity == "warning")
|
| 242 |
+
infos = sum(1 for issue in issues if issue.severity == "info")
|
| 243 |
+
base = (
|
| 244 |
+
f"Detected {len(issues)} issue(s): {critical} critical, "
|
| 245 |
+
f"{warnings} warning, {infos} info."
|
| 246 |
+
)
|
| 247 |
+
if context:
|
| 248 |
+
return f"{base} Context: {context}"
|
| 249 |
+
return base
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def _suggest_improved_code(code: str, issues: List[ReviewFinding]) -> Optional[str]:
|
| 253 |
+
"""Append high-level fix directions to the submitted code."""
|
| 254 |
+
|
| 255 |
+
if not issues:
|
| 256 |
+
return None
|
| 257 |
+
suggestions = [issue.recommendation for issue in issues if issue.recommendation]
|
| 258 |
+
comment = " | ".join(dict.fromkeys(suggestions))
|
| 259 |
+
return f"{code.rstrip()}\n\n# Suggested review directions: {comment}"
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def _deduplicate(findings: List[ReviewFinding]) -> List[ReviewFinding]:
|
| 263 |
+
"""Drop duplicate findings that refer to the same rule and line."""
|
| 264 |
+
|
| 265 |
+
seen = set()
|
| 266 |
+
unique: List[ReviewFinding] = []
|
| 267 |
+
for finding in findings:
|
| 268 |
+
key = (finding.rule_id, finding.line, finding.category)
|
| 269 |
+
if key in seen:
|
| 270 |
+
continue
|
| 271 |
+
seen.add(key)
|
| 272 |
+
unique.append(finding)
|
| 273 |
+
return unique
|
server/task_bank.py
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Static PR-review tasks and hidden grading rubrics."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Dict, Iterable, List, Sequence
|
| 7 |
+
|
| 8 |
+
try:
|
| 9 |
+
from models import Category, Difficulty, Severity, TaskDescriptor, TaskSummary
|
| 10 |
+
except ModuleNotFoundError: # pragma: no cover
|
| 11 |
+
from ..models import Category, Difficulty, Severity, TaskDescriptor, TaskSummary
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass(frozen=True)
|
| 15 |
+
class RubricIssue:
|
| 16 |
+
"""One hidden issue that can be matched by the deterministic grader."""
|
| 17 |
+
|
| 18 |
+
issue_id: str
|
| 19 |
+
file_path: str
|
| 20 |
+
line: int
|
| 21 |
+
category: Category
|
| 22 |
+
severity: Severity
|
| 23 |
+
keywords: Sequence[str]
|
| 24 |
+
min_keyword_hits: int
|
| 25 |
+
weight: float
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass(frozen=True)
|
| 29 |
+
class TaskSpec:
|
| 30 |
+
"""Complete task definition, including hidden rubric metadata."""
|
| 31 |
+
|
| 32 |
+
task_id: str
|
| 33 |
+
difficulty: Difficulty
|
| 34 |
+
title: str
|
| 35 |
+
goal: str
|
| 36 |
+
repo_summary: str
|
| 37 |
+
visible_diff: str
|
| 38 |
+
file_contents: Dict[str, str]
|
| 39 |
+
changed_files: Sequence[str]
|
| 40 |
+
rubric_issues: Sequence[RubricIssue]
|
| 41 |
+
max_steps: int
|
| 42 |
+
|
| 43 |
+
@property
|
| 44 |
+
def available_files(self) -> List[str]:
|
| 45 |
+
return list(self.file_contents.keys())
|
| 46 |
+
|
| 47 |
+
def to_descriptor(self) -> TaskDescriptor:
|
| 48 |
+
return TaskDescriptor(
|
| 49 |
+
task_id=self.task_id,
|
| 50 |
+
difficulty=self.difficulty,
|
| 51 |
+
title=self.title,
|
| 52 |
+
goal=self.goal,
|
| 53 |
+
repo_summary=self.repo_summary,
|
| 54 |
+
changed_files=list(self.changed_files),
|
| 55 |
+
available_files=self.available_files,
|
| 56 |
+
max_steps=self.max_steps,
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
def to_summary(self) -> TaskSummary:
|
| 60 |
+
return TaskSummary(
|
| 61 |
+
task_id=self.task_id,
|
| 62 |
+
difficulty=self.difficulty,
|
| 63 |
+
title=self.title,
|
| 64 |
+
goal=self.goal,
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
TASKS: List[TaskSpec] = [
|
| 69 |
+
TaskSpec(
|
| 70 |
+
task_id="py-pr-review-easy",
|
| 71 |
+
difficulty="easy",
|
| 72 |
+
title="Retry Delay Regression",
|
| 73 |
+
goal=(
|
| 74 |
+
"Review the pull request and identify the real bug introduced in the retry "
|
| 75 |
+
"delay helper before it ships."
|
| 76 |
+
),
|
| 77 |
+
repo_summary=(
|
| 78 |
+
"This service computes retry delays for background notification delivery. "
|
| 79 |
+
"The change is intended to relax validation for legacy callers."
|
| 80 |
+
),
|
| 81 |
+
visible_diff="\n".join(
|
| 82 |
+
[
|
| 83 |
+
"diff --git a/src/notifications/retry.py b/src/notifications/retry.py",
|
| 84 |
+
"@@",
|
| 85 |
+
"- if base_delay <= 0:",
|
| 86 |
+
"+ if base_delay < 0:",
|
| 87 |
+
" return 0.0",
|
| 88 |
+
]
|
| 89 |
+
),
|
| 90 |
+
file_contents={
|
| 91 |
+
"src/notifications/retry.py": "\n".join(
|
| 92 |
+
[
|
| 93 |
+
"from __future__ import annotations",
|
| 94 |
+
"",
|
| 95 |
+
"def calculate_retry_delay(attempt: int, base_delay: float = 2.0) -> float:",
|
| 96 |
+
' """Return the retry delay in seconds."""',
|
| 97 |
+
" if attempt < 0:",
|
| 98 |
+
' raise ValueError(\"attempt must be >= 0\")',
|
| 99 |
+
" if base_delay < 0:",
|
| 100 |
+
" return 0.0",
|
| 101 |
+
" return attempt / base_delay",
|
| 102 |
+
]
|
| 103 |
+
)
|
| 104 |
+
},
|
| 105 |
+
changed_files=("src/notifications/retry.py",),
|
| 106 |
+
rubric_issues=(
|
| 107 |
+
RubricIssue(
|
| 108 |
+
issue_id="zero-base-delay-divides",
|
| 109 |
+
file_path="src/notifications/retry.py",
|
| 110 |
+
line=7,
|
| 111 |
+
category="bug",
|
| 112 |
+
severity="warning",
|
| 113 |
+
keywords=("zero", "division", "base_delay"),
|
| 114 |
+
min_keyword_hits=2,
|
| 115 |
+
weight=1.0,
|
| 116 |
+
),
|
| 117 |
+
),
|
| 118 |
+
max_steps=4,
|
| 119 |
+
),
|
| 120 |
+
TaskSpec(
|
| 121 |
+
task_id="py-pr-review-medium",
|
| 122 |
+
difficulty="medium",
|
| 123 |
+
title="Coupon Billing Rollout",
|
| 124 |
+
goal=(
|
| 125 |
+
"Review the billing change and identify both the production regression and "
|
| 126 |
+
"the missing coverage that would have caught it."
|
| 127 |
+
),
|
| 128 |
+
repo_summary=(
|
| 129 |
+
"The billing service is adding coupon support for one-off invoices. The PR "
|
| 130 |
+
"touches both the service code and its unit tests."
|
| 131 |
+
),
|
| 132 |
+
visible_diff="\n".join(
|
| 133 |
+
[
|
| 134 |
+
"diff --git a/app/billing/invoice_service.py b/app/billing/invoice_service.py",
|
| 135 |
+
"@@",
|
| 136 |
+
" def charge_invoice(order: dict, gateway: Gateway) -> str:",
|
| 137 |
+
"- return gateway.charge(order[\"customer_id\"], order[\"amount_cents\"])",
|
| 138 |
+
"+ total = order[\"amount_cents\"]",
|
| 139 |
+
"+ coupon = order.get(\"coupon_code\")",
|
| 140 |
+
"+ if coupon:",
|
| 141 |
+
"+ discount = gateway.lookup_discount(coupon)",
|
| 142 |
+
"+ total = max(total - discount, 0)",
|
| 143 |
+
"+ return gateway.charge(order[\"customer_id\"], order[\"amount_cents\"])",
|
| 144 |
+
"",
|
| 145 |
+
"diff --git a/tests/test_invoice_service.py b/tests/test_invoice_service.py",
|
| 146 |
+
"@@",
|
| 147 |
+
" class FakeGateway:",
|
| 148 |
+
"+ def lookup_discount(self, coupon: str) -> int:",
|
| 149 |
+
"+ return 250",
|
| 150 |
+
]
|
| 151 |
+
),
|
| 152 |
+
file_contents={
|
| 153 |
+
"app/billing/invoice_service.py": "\n".join(
|
| 154 |
+
[
|
| 155 |
+
"from gateway import Gateway",
|
| 156 |
+
"",
|
| 157 |
+
"def charge_invoice(order: dict, gateway: Gateway) -> str:",
|
| 158 |
+
' total = order["amount_cents"]',
|
| 159 |
+
' coupon = order.get("coupon_code")',
|
| 160 |
+
" if coupon:",
|
| 161 |
+
" discount = gateway.lookup_discount(coupon)",
|
| 162 |
+
" total = max(total - discount, 0)",
|
| 163 |
+
' return gateway.charge(order["customer_id"], order["amount_cents"])',
|
| 164 |
+
]
|
| 165 |
+
),
|
| 166 |
+
"tests/test_invoice_service.py": "\n".join(
|
| 167 |
+
[
|
| 168 |
+
"from app.billing.invoice_service import charge_invoice",
|
| 169 |
+
"",
|
| 170 |
+
"class FakeGateway:",
|
| 171 |
+
" def lookup_discount(self, coupon: str) -> int:",
|
| 172 |
+
" return 250",
|
| 173 |
+
"",
|
| 174 |
+
" def charge(self, customer_id: str, amount_cents: int) -> str:",
|
| 175 |
+
" self.last_charge = (customer_id, amount_cents)",
|
| 176 |
+
' return "charge_123"',
|
| 177 |
+
"",
|
| 178 |
+
"def test_charge_invoice_without_coupon():",
|
| 179 |
+
" gateway = FakeGateway()",
|
| 180 |
+
' charge_invoice({"customer_id": "cus_1", "amount_cents": 1000}, gateway)',
|
| 181 |
+
' assert gateway.last_charge == ("cus_1", 1000)',
|
| 182 |
+
]
|
| 183 |
+
),
|
| 184 |
+
},
|
| 185 |
+
changed_files=("app/billing/invoice_service.py", "tests/test_invoice_service.py"),
|
| 186 |
+
rubric_issues=(
|
| 187 |
+
RubricIssue(
|
| 188 |
+
issue_id="discount-total-unused",
|
| 189 |
+
file_path="app/billing/invoice_service.py",
|
| 190 |
+
line=8,
|
| 191 |
+
category="bug",
|
| 192 |
+
severity="warning",
|
| 193 |
+
keywords=("discount", "total", "charge", "amount"),
|
| 194 |
+
min_keyword_hits=2,
|
| 195 |
+
weight=0.6,
|
| 196 |
+
),
|
| 197 |
+
RubricIssue(
|
| 198 |
+
issue_id="missing-coupon-test",
|
| 199 |
+
file_path="tests/test_invoice_service.py",
|
| 200 |
+
line=11,
|
| 201 |
+
category="testing",
|
| 202 |
+
severity="warning",
|
| 203 |
+
keywords=("missing", "test", "coupon", "discount"),
|
| 204 |
+
min_keyword_hits=2,
|
| 205 |
+
weight=0.4,
|
| 206 |
+
),
|
| 207 |
+
),
|
| 208 |
+
max_steps=5,
|
| 209 |
+
),
|
| 210 |
+
TaskSpec(
|
| 211 |
+
task_id="py-pr-review-hard",
|
| 212 |
+
difficulty="hard",
|
| 213 |
+
title="Async Job Runner Deduplication",
|
| 214 |
+
goal=(
|
| 215 |
+
"Review the async job-runner PR and find the subtle concurrency issues "
|
| 216 |
+
"without inventing extra problems."
|
| 217 |
+
),
|
| 218 |
+
repo_summary=(
|
| 219 |
+
"A shared webhook backfill service is deduplicating in-flight work with an "
|
| 220 |
+
"async task cache and writing the latest result for operators to inspect."
|
| 221 |
+
),
|
| 222 |
+
visible_diff="\n".join(
|
| 223 |
+
[
|
| 224 |
+
"diff --git a/app/jobs/runner.py b/app/jobs/runner.py",
|
| 225 |
+
"@@",
|
| 226 |
+
" async def run_job(job_id: str, payload: dict, worker) -> str:",
|
| 227 |
+
" if job_id in ACTIVE_RUNS:",
|
| 228 |
+
" return await ACTIVE_RUNS[job_id]",
|
| 229 |
+
"+ lock = asyncio.Lock()",
|
| 230 |
+
"+ async with lock:",
|
| 231 |
+
"+ task = asyncio.create_task(worker.run(payload))",
|
| 232 |
+
"+ ACTIVE_RUNS[job_id] = task",
|
| 233 |
+
" try:",
|
| 234 |
+
" result = await task",
|
| 235 |
+
" finally:",
|
| 236 |
+
" ACTIVE_RUNS.pop(job_id, None)",
|
| 237 |
+
"+ Path(\"latest-result.json\").write_text(result)",
|
| 238 |
+
" return result",
|
| 239 |
+
]
|
| 240 |
+
),
|
| 241 |
+
file_contents={
|
| 242 |
+
"app/jobs/runner.py": "\n".join(
|
| 243 |
+
[
|
| 244 |
+
"import asyncio",
|
| 245 |
+
"from pathlib import Path",
|
| 246 |
+
"",
|
| 247 |
+
"ACTIVE_RUNS: dict[str, asyncio.Task[str]] = {}",
|
| 248 |
+
"",
|
| 249 |
+
"async def run_job(job_id: str, payload: dict, worker) -> str:",
|
| 250 |
+
" if job_id in ACTIVE_RUNS:",
|
| 251 |
+
" return await ACTIVE_RUNS[job_id]",
|
| 252 |
+
"",
|
| 253 |
+
" lock = asyncio.Lock()",
|
| 254 |
+
" async with lock:",
|
| 255 |
+
" task = asyncio.create_task(worker.run(payload))",
|
| 256 |
+
" ACTIVE_RUNS[job_id] = task",
|
| 257 |
+
" try:",
|
| 258 |
+
" result = await task",
|
| 259 |
+
" finally:",
|
| 260 |
+
" ACTIVE_RUNS.pop(job_id, None)",
|
| 261 |
+
"",
|
| 262 |
+
' Path("latest-result.json").write_text(result)',
|
| 263 |
+
" return result",
|
| 264 |
+
]
|
| 265 |
+
),
|
| 266 |
+
"tests/test_runner.py": "\n".join(
|
| 267 |
+
[
|
| 268 |
+
"import pytest",
|
| 269 |
+
"",
|
| 270 |
+
"from app.jobs.runner import run_job",
|
| 271 |
+
"",
|
| 272 |
+
"class FakeWorker:",
|
| 273 |
+
" async def run(self, payload: dict) -> str:",
|
| 274 |
+
' return payload["job_id"]',
|
| 275 |
+
"",
|
| 276 |
+
"@pytest.mark.asyncio",
|
| 277 |
+
"async def test_run_job_returns_worker_result():",
|
| 278 |
+
" worker = FakeWorker()",
|
| 279 |
+
' result = await run_job("job-1", {"job_id": "job-1"}, worker)',
|
| 280 |
+
' assert result == "job-1"',
|
| 281 |
+
]
|
| 282 |
+
),
|
| 283 |
+
},
|
| 284 |
+
changed_files=("app/jobs/runner.py", "tests/test_runner.py"),
|
| 285 |
+
rubric_issues=(
|
| 286 |
+
RubricIssue(
|
| 287 |
+
issue_id="per-call-lock-race",
|
| 288 |
+
file_path="app/jobs/runner.py",
|
| 289 |
+
line=9,
|
| 290 |
+
category="bug",
|
| 291 |
+
severity="warning",
|
| 292 |
+
keywords=("lock", "race", "concurrent", "duplicate"),
|
| 293 |
+
min_keyword_hits=2,
|
| 294 |
+
weight=0.55,
|
| 295 |
+
),
|
| 296 |
+
RubricIssue(
|
| 297 |
+
issue_id="shared-output-file-race",
|
| 298 |
+
file_path="app/jobs/runner.py",
|
| 299 |
+
line=18,
|
| 300 |
+
category="maintainability",
|
| 301 |
+
severity="warning",
|
| 302 |
+
keywords=("latest", "result", "file", "concurrent", "overwrite"),
|
| 303 |
+
min_keyword_hits=2,
|
| 304 |
+
weight=0.45,
|
| 305 |
+
),
|
| 306 |
+
),
|
| 307 |
+
max_steps=6,
|
| 308 |
+
),
|
| 309 |
+
]
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
TASKS_BY_ID: Dict[str, TaskSpec] = {task.task_id: task for task in TASKS}
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
def list_task_descriptors() -> List[TaskDescriptor]:
|
| 316 |
+
"""Return public descriptors for all tasks."""
|
| 317 |
+
|
| 318 |
+
return [task.to_descriptor() for task in TASKS]
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
def list_task_summaries() -> List[TaskSummary]:
|
| 322 |
+
"""Return task summaries for lightweight route responses."""
|
| 323 |
+
|
| 324 |
+
return [task.to_summary() for task in TASKS]
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def get_task(task_id: str) -> TaskSpec:
|
| 328 |
+
"""Return a task by id."""
|
| 329 |
+
|
| 330 |
+
try:
|
| 331 |
+
return TASKS_BY_ID[task_id]
|
| 332 |
+
except KeyError as exc: # pragma: no cover
|
| 333 |
+
raise ValueError(f"Unknown task_id: {task_id}") from exc
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
def task_ids() -> Iterable[str]:
|
| 337 |
+
"""Return task ids in benchmark order."""
|
| 338 |
+
|
| 339 |
+
return [task.task_id for task in TASKS]
|
| 340 |
+
|
summary/01_introduction_quickstart.md
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 01. Introduction & Quick Start
|
| 2 |
+
|
| 3 |
+
Source:
|
| 4 |
+
- https://meta-pytorch.org/OpenEnv/auto_getting_started/plot_01_introduction_quickstart.html
|
| 5 |
+
|
| 6 |
+
## Main idea
|
| 7 |
+
|
| 8 |
+
OpenEnv is a standardized framework for building, sharing, and using RL environments as typed, containerized services.
|
| 9 |
+
|
| 10 |
+
The official docs frame it as:
|
| 11 |
+
|
| 12 |
+
- Gym-style interaction
|
| 13 |
+
- Docker-based isolation
|
| 14 |
+
- typed contracts
|
| 15 |
+
- HTTP/WebSocket access
|
| 16 |
+
- easy sharing through Hugging Face
|
| 17 |
+
|
| 18 |
+
## Core loop
|
| 19 |
+
|
| 20 |
+
The RL interaction model is still the normal loop:
|
| 21 |
+
|
| 22 |
+
1. reset environment
|
| 23 |
+
2. observe state
|
| 24 |
+
3. choose action
|
| 25 |
+
4. call step
|
| 26 |
+
5. receive reward + next observation
|
| 27 |
+
6. repeat until done
|
| 28 |
+
|
| 29 |
+
The difference is that OpenEnv wraps this loop in a typed client/server system.
|
| 30 |
+
|
| 31 |
+
## Why OpenEnv instead of only Gym
|
| 32 |
+
|
| 33 |
+
The docs emphasize these advantages:
|
| 34 |
+
|
| 35 |
+
- type safety
|
| 36 |
+
- environment isolation through containers
|
| 37 |
+
- better reproducibility
|
| 38 |
+
- easier sharing and deployment
|
| 39 |
+
- language-agnostic communication
|
| 40 |
+
- cleaner debugging
|
| 41 |
+
|
| 42 |
+
The key contrast is:
|
| 43 |
+
|
| 44 |
+
- old style: raw arrays and same-process execution
|
| 45 |
+
- OpenEnv style: typed objects and isolated environment runtime
|
| 46 |
+
|
| 47 |
+
## Important mental model
|
| 48 |
+
|
| 49 |
+
OpenEnv treats environments more like services than in-process libraries.
|
| 50 |
+
|
| 51 |
+
That means:
|
| 52 |
+
|
| 53 |
+
- your environment logic can run separately from the agent code
|
| 54 |
+
- failures in the environment do not automatically crash the training loop
|
| 55 |
+
- deployment and usage are closer to how production systems work
|
| 56 |
+
|
| 57 |
+
## What this means for `python_env`
|
| 58 |
+
|
| 59 |
+
Your repo should keep these properties intact:
|
| 60 |
+
|
| 61 |
+
- typed `Action`, `Observation`, and evaluation models
|
| 62 |
+
- a clean environment class with `reset()`, `step()`, and `state`
|
| 63 |
+
- a client that hides transport details
|
| 64 |
+
- a deployable container
|
| 65 |
+
|
| 66 |
+
For hackathon purposes, this page is the justification for why your project is not just a script. It is a reusable environment artifact.
|
summary/02_using_environments.md
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 02. Using Environments
|
| 2 |
+
|
| 3 |
+
Source:
|
| 4 |
+
- https://meta-pytorch.org/OpenEnv/auto_getting_started/plot_02_using_environments.html
|
| 5 |
+
|
| 6 |
+
## Main idea
|
| 7 |
+
|
| 8 |
+
This page is about how users consume an existing OpenEnv environment.
|
| 9 |
+
|
| 10 |
+
The docs highlight three connection methods:
|
| 11 |
+
|
| 12 |
+
1. from Hugging Face Hub
|
| 13 |
+
2. from Docker image
|
| 14 |
+
3. from direct base URL
|
| 15 |
+
|
| 16 |
+
## Connection methods
|
| 17 |
+
|
| 18 |
+
### 1. From Hugging Face Hub
|
| 19 |
+
|
| 20 |
+
The easiest route for end users.
|
| 21 |
+
|
| 22 |
+
Typical flow:
|
| 23 |
+
|
| 24 |
+
- pull the image from the HF registry
|
| 25 |
+
- start the container locally
|
| 26 |
+
- connect to it
|
| 27 |
+
- clean it up on close
|
| 28 |
+
|
| 29 |
+
The docs show the pattern conceptually as:
|
| 30 |
+
|
| 31 |
+
```python
|
| 32 |
+
MyEnv.from_hub("owner/env-name")
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
### 2. From Docker image
|
| 36 |
+
|
| 37 |
+
Useful when:
|
| 38 |
+
|
| 39 |
+
- you already built the image locally
|
| 40 |
+
- you want reproducible local runs
|
| 41 |
+
- you do not want to depend on a live remote Space
|
| 42 |
+
|
| 43 |
+
Typical pattern:
|
| 44 |
+
|
| 45 |
+
```python
|
| 46 |
+
MyEnv.from_docker_image("my-env:latest")
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
### 3. Direct URL connection
|
| 50 |
+
|
| 51 |
+
Useful when:
|
| 52 |
+
|
| 53 |
+
- the server is already running
|
| 54 |
+
- you want to connect to localhost or a deployed Space
|
| 55 |
+
|
| 56 |
+
Typical pattern:
|
| 57 |
+
|
| 58 |
+
```python
|
| 59 |
+
MyEnv(base_url="http://localhost:8000")
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
## WebSocket model
|
| 63 |
+
|
| 64 |
+
The docs emphasize that OpenEnv uses WebSocket-backed sessions for persistent environment interaction.
|
| 65 |
+
|
| 66 |
+
Why this matters:
|
| 67 |
+
|
| 68 |
+
- lower overhead than stateless HTTP on every step
|
| 69 |
+
- cleaner session management
|
| 70 |
+
- better fit for multi-step RL loops
|
| 71 |
+
|
| 72 |
+
## Environment loop
|
| 73 |
+
|
| 74 |
+
The intended use pattern is:
|
| 75 |
+
|
| 76 |
+
1. connect
|
| 77 |
+
2. reset
|
| 78 |
+
3. repeatedly call `step(action)`
|
| 79 |
+
4. inspect `reward`, `done`, and `observation`
|
| 80 |
+
5. close cleanly
|
| 81 |
+
|
| 82 |
+
## What this means for `python_env`
|
| 83 |
+
|
| 84 |
+
Your environment should be easy to consume in all three modes:
|
| 85 |
+
|
| 86 |
+
- local URL
|
| 87 |
+
- local Docker image
|
| 88 |
+
- HF Space
|
| 89 |
+
|
| 90 |
+
That means the most important user-facing checks are:
|
| 91 |
+
|
| 92 |
+
- `reset()` works
|
| 93 |
+
- `step()` works
|
| 94 |
+
- the client can parse the observation correctly
|
| 95 |
+
- Docker image starts cleanly
|
| 96 |
+
- deployed Space responds on `/health`, `/docs`, and session routes
|
| 97 |
+
|
| 98 |
+
For hackathon validation, this page is basically the “user experience” standard you need to match.
|