uvpatel7271 committed on
Commit
c8e832f
·
verified ·
1 Parent(s): 1c8b7f1

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Dockerfile +22 -71
  2. Project.md +1111 -0
  3. README.md +264 -258
  4. REWARD_SYSTEM_GUIDE.md +206 -0
  5. __init__.py +35 -11
  6. client.py +70 -41
  7. compat.py +92 -0
  8. examples/__init__.py +1 -0
  9. examples/python_review_examples.py +58 -0
  10. graders/__init__.py +16 -0
  11. graders/common.py +82 -0
  12. graders/optimization.py +167 -0
  13. graders/pytest_runner.py +149 -0
  14. graders/syntax.py +78 -0
  15. inference.py +462 -314
  16. models.py +185 -221
  17. openenv.yaml +20 -7
  18. openenv_python_env.egg-info/PKG-INFO +6 -3
  19. openenv_python_env.egg-info/SOURCES.txt +13 -5
  20. openenv_python_env.egg-info/requires.txt +4 -1
  21. pyproject.toml +33 -46
  22. pytest-cache-files-1f62ra1g/CACHEDIR.TAG +4 -0
  23. pytest-cache-files-1f62ra1g/README.md +8 -0
  24. pytest-cache-files-i2cpw3zw/CACHEDIR.TAG +4 -0
  25. pytest-cache-files-i2cpw3zw/README.md +8 -0
  26. pytest-cache-files-le0qcl0z/CACHEDIR.TAG +4 -0
  27. pytest-cache-files-le0qcl0z/README.md +8 -0
  28. pytest-cache-files-qm8xzmpt/CACHEDIR.TAG +4 -0
  29. pytest-cache-files-qm8xzmpt/README.md +8 -0
  30. pytest-cache-files-qun9v98v/CACHEDIR.TAG +4 -0
  31. pytest-cache-files-qun9v98v/README.md +8 -0
  32. pytest-cache-files-srp2otxc/CACHEDIR.TAG +4 -0
  33. pytest-cache-files-srp2otxc/README.md +8 -0
  34. pytest-cache-files-u6t7g29i/CACHEDIR.TAG +4 -0
  35. pytest-cache-files-u6t7g29i/README.md +8 -0
  36. pytest-cache-files-x1yzwik9/CACHEDIR.TAG +4 -0
  37. pytest-cache-files-x1yzwik9/README.md +8 -0
  38. server/__init__.py +5 -11
  39. server/app.py +114 -81
  40. server/code_review_env_environment.py +9 -0
  41. server/code_review_environment.py +5 -0
  42. server/env.py +1 -0
  43. server/env_safe.py +492 -0
  44. server/grading.py +147 -0
  45. server/python_env_environment.py +9 -421
  46. server/requirements.txt +6 -6
  47. server/static_review.py +273 -0
  48. server/task_bank.py +340 -0
  49. summary/01_introduction_quickstart.md +66 -0
  50. summary/02_using_environments.md +98 -0
Dockerfile CHANGED
@@ -1,81 +1,32 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- # Multi-stage build using openenv-base
8
- # This Dockerfile is flexible and works for both:
9
- # - In-repo environments (with local OpenEnv sources)
10
- # - Standalone environments (with openenv from PyPI/Git)
11
- # The build script (openenv build) handles context detection and sets appropriate build args.
12
-
13
- ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
- FROM ${BASE_IMAGE} AS builder
15
-
16
- WORKDIR /app
17
-
18
- # Ensure git is available (required for installing dependencies from VCS)
19
- RUN apt-get update && \
20
- apt-get install -y --no-install-recommends git && \
21
- rm -rf /var/lib/apt/lists/*
22
-
23
- # Build argument to control whether we're building standalone or in-repo
24
- ARG BUILD_MODE=in-repo
25
- ARG ENV_NAME=python_env
26
-
27
- # Copy environment code (always at root of build context)
28
- COPY . /app/env
29
-
30
- # For in-repo builds, openenv is already vendored in the build context
31
- # For standalone builds, openenv will be installed via pyproject.toml
32
- WORKDIR /app/env
33
-
34
- # Ensure uv is available (for local builds where base image lacks it)
35
- RUN if ! command -v uv >/dev/null 2>&1; then \
36
- curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
- mv /root/.local/bin/uv /usr/local/bin/uv && \
38
- mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
- fi
40
-
41
- # Install dependencies using uv sync
42
- # If uv.lock exists, use it; otherwise resolve on the fly
43
- RUN --mount=type=cache,target=/root/.cache/uv \
44
- if [ -f uv.lock ]; then \
45
- uv sync --frozen --no-install-project --no-editable; \
46
- else \
47
- uv sync --no-install-project --no-editable; \
48
- fi
49
-
50
- RUN --mount=type=cache,target=/root/.cache/uv \
51
- if [ -f uv.lock ]; then \
52
- uv sync --frozen --no-editable; \
53
- else \
54
- uv sync --no-editable; \
55
- fi
56
-
57
- # Final runtime stage
58
- FROM ${BASE_IMAGE}
59
 
60
  WORKDIR /app
61
 
62
- # Copy the virtual environment from builder
63
- COPY --from=builder /app/env/.venv /app/.venv
 
 
 
 
64
 
65
- # Copy the environment code
66
- COPY --from=builder /app/env /app/env
67
 
68
- # Set PATH to use the virtual environment
69
- ENV PATH="/app/.venv/bin:$PATH"
70
 
71
- # Set PYTHONPATH so imports work correctly
72
- ENV PYTHONPATH="/app/env:$PYTHONPATH"
 
 
 
 
73
 
74
  # Health check
75
- HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
76
- CMD curl -f http://localhost:8000/health || exit 1
77
 
78
- # Run the FastAPI server
79
- # The module path is constructed to work with the /app/env structure
80
  ENV ENABLE_WEB_INTERFACE=true
81
- CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
 
1
+ FROM python:3.11-slim
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install system dependencies
6
+ RUN apt-get update && apt-get install -y --no-install-recommends \
7
+ gcc \
8
+ git \
9
+ curl \
10
+ && rm -rf /var/lib/apt/lists/*
11
 
12
+ # Copy source code
13
+ COPY . /app
14
 
15
+ # Install Python dependencies
16
+ RUN pip install --no-cache-dir -r requirements.txt
17
 
18
+ # Set environment variables
19
+ ENV PYTHONUNBUFFERED=1
20
+ ENV HOST=0.0.0.0
21
+ ENV PORT=8000
22
+ ENV WORKERS=1
23
+ ENV MAX_CONCURRENT_ENVS=16
24
 
25
  # Health check
26
+ HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
27
+ CMD curl -f http://localhost:${PORT}/health || exit 1
28
 
29
+ # Run FastAPI app
30
+ EXPOSE ${PORT}
31
  ENV ENABLE_WEB_INTERFACE=true
32
+ CMD ["python", "-m", "server.app"]
Project.md ADDED
@@ -0,0 +1,1111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python inference.py --model gpt-3.5-turbo --base-url "http://localhost:8000/v1"
2
+ python inference.py --model gemini-2.0-flash --base-url "https://generativelanguage.googleapis.com/openai/"
3
+ python inference.py --model deepseek-chat --base-url "https://api.deepseek.com"
+
+ # Python Env Project Guide
4
+
5
+ This document explains how to work with the `python_env` project end to end:
6
+
7
+ 1. What the environment is trying to do
8
+ 2. How the current code is structured
9
+ 3. How each route works
10
+ 4. How to test each route manually
11
+ 5. How to use the inference script
12
+ 6. How to prepare data so an RL or agent-training setup can learn more effectively
13
+ 7. How the project maps to the hackathon functional requirements
14
+
15
+ The goal is practical: after reading this file, you should be able to start the server, hit every route, understand what each response means, run the baseline, and know what data to collect next.
16
+
17
+ ## 1. Project Goal
18
+
19
+ This environment simulates a real software engineering workflow: Python code review.
20
+
21
+ An agent is given Python code and must:
22
+
23
+ - detect correctness bugs
24
+ - detect security risks
25
+ - detect maintainability problems
26
+ - detect obvious performance issues
27
+ - optionally suggest improved code
28
+
29
+ This is a valid real-world environment because code review is an actual human task used in engineering teams every day.
30
+
31
+ ## 2. High-Level Architecture
32
+
33
+ The project has four main parts:
34
+
35
+ - `models.py`
36
+ Defines the typed Pydantic models for actions, observations, evaluations, config, health, and direct-review payloads.
37
+
38
+ - `server/code_review_environment.py`
39
+ Implements the environment logic: `reset()`, `step()`, reward shaping, task progression, hints, history, and grading integration.
40
+
41
+ - `server/task_bank.py`, `server/grading.py`, `server/static_review.py`
42
+ These files define the benchmark tasks, deterministic graders, and direct static review rules.
43
+
44
+ - `server/app.py`
45
+ Exposes both:
46
+ - OpenEnv-compatible endpoints such as `/reset`, `/step`, `/state`, `/schema`, `/ws`
47
+ - custom REST endpoints such as `/health`, `/tasks`, `/review`, `/config`, `/history`
48
+
49
+ - `inference.py`
50
+ Runs an OpenAI-compatible model against the environment and writes a reproducible report.
51
+
52
+ ## 3. File-by-File Understanding
53
+
54
+ ### `models.py`
55
+
56
+ Important models:
57
+
58
+ - `ReviewFinding`
59
+ One code-review issue found by the agent.
60
+ Fields:
61
+ - `title`
62
+ - `line`
63
+ - `category`
64
+ - `severity`
65
+ - `rationale`
66
+ - `recommendation`
67
+ - `rule_id`
68
+
69
+ - `PythonReviewAction`
70
+ What the agent sends to the environment.
71
+ Fields:
72
+ - `operation`
73
+ - `findings`
74
+ - `patched_code`
75
+ - `note`
76
+
77
+ - `PythonReviewObservation`
78
+ What the environment returns back.
79
+ Fields:
80
+ - `task`
81
+ - `instructions`
82
+ - `feedback`
83
+ - `submitted_findings`
84
+ - `hints_used`
85
+ - `attempts_remaining`
86
+ - `evaluation`
87
+ - `score`
88
+ - `review_time_ms`
89
+ - inherited OpenEnv fields such as `reward`, `done`, `metadata`
90
+
91
+ - `TaskEvaluation`
92
+ Deterministic grading output.
93
+ Fields:
94
+ - `matched_reference_ids`
95
+ - `matched_findings`
96
+ - `total_findings`
97
+ - `false_positives`
98
+ - `duplicate_findings`
99
+ - `weighted_recall`
100
+ - `patch_score`
101
+ - `score`
102
+ - `passed`
103
+
104
+ ### `server/task_bank.py`
105
+
106
+ Contains the benchmark tasks.
107
+
108
+ Current tasks:
109
+
110
+ 1. `py-review-easy`
111
+ Detect unsafe `eval` and division-by-zero risk.
112
+
113
+ 2. `py-review-medium`
114
+ Detect mutable default list, quadratic membership check, and bare `except`.
115
+
116
+ 3. `py-review-hard`
117
+ Detect `shell=True` command injection, stale cache bug, and shared output file risk.
118
+
119
+ Each task contains:
120
+
121
+ - code to review
122
+ - hints
123
+ - reference findings
124
+ - pass threshold
125
+
126
+ ### `server/grading.py`
127
+
128
+ This is the benchmark grader.
129
+
130
+ It compares submitted findings to hidden reference findings and computes:
131
+
132
+ - weighted recall
133
+ - penalties for false positives
134
+ - penalties for duplicates
135
+ - optional patch quality score
136
+ - final score in `0.0` to `1.0`
137
+
138
+ This makes the task deterministic and reproducible, which is important for hackathon judging.
139
+
140
+ ### `server/static_review.py`
141
+
142
+ This powers the `/review` endpoint for arbitrary code snippets.
143
+
144
+ It uses AST inspection to detect:
145
+
146
+ - `eval` / `exec`
147
+ - mutable default arguments
148
+ - `shell=True`
149
+ - bare `except`
150
+ - list-membership-inside-loop performance smell
151
+ - syntax errors
152
+ - `print()` used in application logic
153
+
154
+ This is not the task grader. It is the direct-review helper.
155
+
156
+ ### Reward System
157
+
158
+ The reward system is **dynamic and multi-component**, designed to provide meaningful feedback at every step of the agent's learning process.
159
+
160
+ #### Reward Architecture
161
+
162
+ The system computes rewards using **6 independent components**:
163
+
164
+ 1. **Progress Reward** (max +0.25)
165
+ - Awarded when the agent improves the score from one step to the next
166
+ - Formula: `min(PROGRESS_SCALE * score_delta, 0.25)`
167
+ - Encourages continuous improvement
168
+
169
+ 2. **Syntax Reward** (max +0.35)
170
+ - One-time bonus awarded for fixing syntax errors (first time compiling)
171
+ - Applied once per episode when code transitions from uncompilable to compilable
172
+ - Acknowledges the critical first step of making code valid
173
+
174
+ 3. **Test Reward** (max +0.20)
175
+ - Based on improvement in test pass rate
176
+ - Computed as: `min(TEST_PASS_REWARD_SCALE * test_improvement_fraction, 0.20)`
177
+ - Rewards incremental progress on passing more tests
178
+
179
+ 4. **Quality Reward** (max +0.15)
180
+ - Based on AST-detected code quality metrics
181
+ - Rewards improvements in code structure, readability, and best practices
182
+ - Uses deterministic grader feedback
183
+
184
+ 5. **Stagnation Penalty** (−0.10)
185
+ - Applied when the agent takes action but code doesn't change
186
+ - Encourages the agent to edit the code rather than analyze repeatedly
187
+ - Configurable via `STAGNATION_PENALTY` constant
188
+
189
+ 6. **Regression Penalty** (scale −0.20)
190
+ - Applied when score decreases from previous step
191
+ - Formula: `REGRESSION_PENALTY_SCALE * abs(score_delta)`
192
+ - Discourages actions that make code worse
193
+
194
+ #### Reward Constants
195
+
196
+ Defined at the top of `server/env.py`:
197
+
198
+ ```python
199
+ SYNTAX_FIX_BONUS = 0.35 # One-time syntax reward
200
+ TEST_PASS_REWARD_SCALE = 0.30 # Per test improvement
201
+ QUALITY_BONUS_SCALE = 0.15 # Code quality improvement
202
+ PROGRESS_SCALE = 0.25 # Score improvement
203
+ COMPLETION_BONUS = 0.50 # Full correctness bonus
204
+ INVALID_ACTION_PENALTY = 0.15 # For unsupported actions
205
+ STAGNATION_PENALTY = 0.10 # For unchanged code
206
+ REGRESSION_PENALTY_SCALE = 0.20 # For score decline
207
+ TIMEOUT_PENALTY = 0.15 # For execution timeout
208
+ ```
209
+
210
+ #### Final Reward Computation
211
+
212
+ The final reward is:
213
+
214
+ ```
215
+ total = progress + syntax + test + quality - stagnation - regression
216
+ final_reward = clamp(total, -1.0, +1.0)
217
+ ```
218
+
219
+ The result is always between −1.0 and +1.0, providing bounded, interpretable feedback.
220
+
221
+ #### RewardDetails: Transparent Feedback
222
+
223
+ Every reward is returned as a `RewardDetails` object with these fields:
224
+
225
+ - `value`: The scalar reward for this step
226
+ - `syntax_reward`: Contribution from syntax fixes
227
+ - `test_reward`: Contribution from test improvements
228
+ - `quality_bonus`: Contribution from code quality
229
+ - `progress_delta`: Contribution from score improvement
230
+ - `stagnation_penalty`: Penalty for unchanged code
231
+ - `regression_penalty`: Penalty for score decline
232
+ - `prev_score` / `curr_score`: Score before and after the action
233
+ - `code_changed`: Whether the action modified the code
234
+ - `reason`: Human-readable explanation of the reward
235
+
236
+ This transparency is crucial for:
237
+ - Debugging agent behavior
238
+ - Understanding what drives reward
239
+ - Tuning the constants
240
+ - Training supervised models on reward components
241
+
242
+ #### Why This Design Helps Agents Learn
243
+
244
+ 1. **Non-Constant**: Different actions produce different rewards, enabling meaningful gradient signals
245
+ 2. **Progressive**: Early bonuses (syntax) are high; later improvements are smaller, promoting efficiency
246
+ 3. **Transparent**: Detailed component breakdown helps agents understand what matters
247
+ 4. **Bounded**: Clamping to [−1, 1] prevents reward hacking and explosion
248
+ 5. **Balanced**: Positive and negative signals teach precision and recall together
249
+
250
+ ### `server/code_review_environment.py`
251
+
252
+ This is the environment core.
253
+
254
+ Main methods:
255
+
256
+ - `reset()`
257
+ Rotates to the next task, resets episode state, and returns the initial observation.
258
+
259
+ - `step(action)`
260
+ Accepts a `PythonReviewAction`, grades it, shapes reward, updates history, and returns the new observation.
261
+
262
+ - `direct_review(code, context)`
263
+ Calls the static reviewer for arbitrary code.
264
+
265
+ - `list_tasks()`
266
+ Returns public descriptors for all tasks.
267
+
268
+ - `grade_task_submission(task_id, findings, patched_code)`
269
+ Grades a proposed submission against the deterministic rubric without stepping through an episode.
270
+
271
+ ### `server/app.py`
272
+
273
+ This file wires everything to FastAPI and OpenEnv.
274
+
275
+ Important note:
276
+
277
+ - OpenEnv endpoints are managed through `create_app(PythonEnvironment, PythonReviewAction, PythonReviewObservation)`
278
+ - custom routes such as `/health`, `/tasks`, `/review`, `/history`, `/config` use a singleton `python_env`
279
+
280
+ That means:
281
+
282
+ - `/reset` and `/step` are served by OpenEnv session handling
283
+ - `/review`, `/tasks`, `/config`, `/history` are served by the singleton helper instance
284
+
285
+ This is fine for startup and manual testing, but if you want one fully unified state model later, you should refactor custom routes to read from the same managed environment/session layer.
286
+
287
+ ## 4. Route-by-Route Guide
288
+
289
+ ### OpenEnv Routes
290
+
291
+ These are important for validation and agents.
292
+
293
+ #### `POST /reset`
294
+
295
+ Purpose:
296
+ - starts a new episode
297
+ - rotates to the next benchmark task
298
+ - returns an initial observation
299
+
300
+ Use this when:
301
+ - you want to start evaluating an agent on a task
302
+
303
+ #### `POST /step`
304
+
305
+ Purpose:
306
+ - submit agent actions
307
+ - get reward, observation, and done flag
308
+
309
+ Use this when:
310
+ - manually simulating agent steps
311
+ - testing reward shaping and grading
312
+
313
+ #### `GET /state`
314
+
315
+ Purpose:
316
+ - returns current OpenEnv session state, typically `episode_id` and `step_count`
317
+
318
+ Use this when:
319
+ - debugging session behavior
320
+
321
+ #### `GET /schema`
322
+
323
+ Purpose:
324
+ - shows the action/observation schema expected by OpenEnv
325
+
326
+ Use this when:
327
+ - debugging payload formats
328
+ - verifying OpenEnv compatibility
329
+
330
+ #### `WS /ws`
331
+
332
+ Purpose:
333
+ - persistent lower-latency session transport for clients
334
+
335
+ Use this when:
336
+ - building actual agent loops with the `EnvClient`
337
+
338
+ ### Custom REST Routes
339
+
340
+ #### `GET /health`
341
+
342
+ Purpose:
343
+ - quick health check for Docker and Hugging Face Spaces
344
+
345
+ Use this when:
346
+ - checking whether the server is alive
347
+ - validating deployment health
348
+
349
+ #### `GET /tasks`
350
+
351
+ Purpose:
352
+ - returns the three benchmark task descriptors
353
+
354
+ Use this when:
355
+ - reviewing available tasks
356
+ - building curriculum/eval metadata
357
+
358
+ #### `GET /tasks/{task_id}`
359
+
360
+ Purpose:
361
+ - returns one task descriptor
362
+
363
+ Use this when:
364
+ - inspecting a task before submitting findings
365
+
366
+ #### `POST /tasks/{task_id}/grade`
367
+
368
+ Purpose:
369
+ - grade a proposed set of findings against the deterministic task rubric
370
+
371
+ Use this when:
372
+ - validating benchmark grading directly
373
+ - building offline evaluation sets
374
+
375
+ #### `POST /review`
376
+
377
+ Purpose:
378
+ - run direct static review on arbitrary Python code
379
+
380
+ Use this when:
381
+ - testing the static analyzer
382
+ - building training examples
383
+ - verifying that common issues are caught
384
+
385
+ #### `GET /history`
386
+
387
+ Purpose:
388
+ - returns the singleton environment history
389
+
390
+ Use this when:
391
+ - checking what the custom singleton environment has processed
392
+
393
+ Note:
394
+ - this history is not the same as OpenEnv session history from `/step`
395
+
396
+ #### `DELETE /history`
397
+
398
+ Purpose:
399
+ - clears the singleton history
400
+
401
+ Use this when:
402
+ - resetting the custom review log before a test run
403
+
404
+ #### `GET /config`
405
+
406
+ Purpose:
407
+ - inspect config values such as penalties and task order
408
+
409
+ #### `PUT /config`
410
+
411
+ Purpose:
412
+ - update the environment config
413
+
414
+ Use this when:
415
+ - testing different reward penalties or task order
416
+
417
+ ## 5. Manual Testing: Step by Step
418
+
419
+ Start the server:
420
+
421
+ ```powershell
422
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
423
+ ```
424
+
425
+ Open the docs:
426
+
427
+ ```text
428
+ http://127.0.0.1:8000/docs
429
+ ```
430
+
431
+ That is the easiest manual route explorer.
432
+
433
+ ### Test 1: Health
434
+
435
+ ```powershell
436
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/health" -Method Get
437
+ ```
438
+
439
+ Expected:
440
+ - `status` should be `ok`
441
+ - `task_count` should be `3`
442
+
443
+ ### Test 2: List Tasks
444
+
445
+ ```powershell
446
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/tasks" -Method Get
447
+ ```
448
+
449
+ Expected:
450
+ - three tasks
451
+ - each task has `task_id`, `difficulty`, `title`, `objective`, `code`
452
+
453
+ ### Test 3: Get One Task
454
+
455
+ ```powershell
456
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/tasks/py-review-easy" -Method Get
457
+ ```
458
+
459
+ ### Test 4: Direct Static Review
460
+
461
+ ```powershell
462
+ $body = @{
463
+ code = @"
464
+ def load_settings(config_text):
465
+ return eval(config_text)
466
+ "@
467
+ } | ConvertTo-Json
468
+
469
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/review" `
470
+ -Method Post `
471
+ -Body $body `
472
+ -ContentType "application/json"
473
+ ```
474
+
475
+ Expected:
476
+ - at least one issue
477
+ - one issue should have `rule_id = "avoid-eval"`
478
+
479
+ ### Test 5: Reset Episode
480
+
481
+ ```powershell
482
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/reset" `
483
+ -Method Post `
484
+ -Body "{}" `
485
+ -ContentType "application/json"
486
+ ```
487
+
488
+ Expected:
489
+ - an observation with a `task`
490
+ - `done = false`
491
+ - `reward = 0`
492
+
493
+ ### Test 6: Submit Partial Findings To `/step`
494
+
495
+ ```powershell
496
+ $body = @{
497
+ operation = "submit_findings"
498
+ findings = @(
499
+ @{
500
+ title = "Avoid eval on untrusted configuration data"
501
+ line = 2
502
+ category = "security"
503
+ severity = "critical"
504
+ rationale = "eval can execute attacker-controlled code."
505
+ recommendation = "Use json.loads or ast.literal_eval."
506
+ rule_id = "avoid-eval"
507
+ }
508
+ )
509
+ patched_code = $null
510
+ note = "First pass review"
511
+ } | ConvertTo-Json -Depth 5
512
+
513
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/step" `
514
+ -Method Post `
515
+ -Body $body `
516
+ -ContentType "application/json"
517
+ ```
518
+
519
+ Expected:
520
+ - positive reward
521
+ - improved `score`
522
+ - feedback mentioning a matched rubric item
523
+
524
+ ### Test 7: Request A Hint
525
+
526
+ ```powershell
527
+ $body = @{
528
+ operation = "request_hint"
529
+ findings = @()
530
+ patched_code = $null
531
+ note = "Need help"
532
+ } | ConvertTo-Json -Depth 5
533
+
534
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/step" `
535
+ -Method Post `
536
+ -Body $body `
537
+ -ContentType "application/json"
538
+ ```
539
+
540
+ Expected:
541
+ - small negative reward
542
+ - feedback containing `Hint 1: ...`
543
+
544
+ ### Test 8: Finalize A Full Submission
545
+
546
+ ```powershell
547
+ $body = @{
548
+ operation = "finalize"
549
+ findings = @(
550
+ @{
551
+ title = "Avoid eval on untrusted configuration data"
552
+ line = 2
553
+ category = "security"
554
+ severity = "critical"
555
+ rationale = "eval can execute attacker-controlled code."
556
+ recommendation = "Use json.loads or ast.literal_eval."
557
+ rule_id = "avoid-eval"
558
+ },
559
+ @{
560
+ title = "Default count of zero causes a division by zero"
561
+ line = 5
562
+ category = "bug"
563
+ severity = "warning"
564
+ rationale = "count defaults to zero and division crashes."
565
+ recommendation = "Validate count before dividing."
566
+ rule_id = "division-by-zero-default"
567
+ }
568
+ )
569
+ patched_code = $null
570
+ note = "Final review"
571
+ } | ConvertTo-Json -Depth 6
572
+
573
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/step" `
574
+ -Method Post `
575
+ -Body $body `
576
+ -ContentType "application/json"
577
+ ```
578
+
579
+ Expected:
580
+ - `done = true`
581
+ - `evaluation.passed = true`
582
+ - `score` near or above task threshold
583
+
584
+ ### Test 9: Inspect State
585
+
586
+ ```powershell
587
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/state" -Method Get
588
+ ```
589
+
590
+ ### Test 10: Inspect Schemas
591
+
592
+ ```powershell
593
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/schema" -Method Get
594
+ ```
595
+
596
+ ### Test 11: Grade A Task Without Running An Episode
597
+
598
+ ```powershell
599
+ $body = @{
600
+ operation = "submit_findings"
601
+ findings = @(
602
+ @{
603
+ title = "shell=True with interpolated input allows command injection"
604
+ line = 10
605
+ category = "security"
606
+ severity = "critical"
607
+ rationale = "The command string includes user input and runs via shell."
608
+ recommendation = "Pass args as a list and keep shell=False."
609
+ rule_id = "shell-true-command-injection"
610
+ }
611
+ )
612
+ patched_code = $null
613
+ note = "Offline grader test"
614
+ } | ConvertTo-Json -Depth 6
615
+
616
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/tasks/py-review-hard/grade" `
617
+ -Method Post `
618
+ -Body $body `
619
+ -ContentType "application/json"
620
+ ```
621
+
622
+ ### Test 12: Config Read And Update
623
+
624
+ Read:
625
+
626
+ ```powershell
627
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/config" -Method Get
628
+ ```
629
+
630
+ Update:
631
+
632
+ ```powershell
633
+ $body = @{
634
+ task_order = @("py-review-easy", "py-review-medium", "py-review-hard")
635
+ max_steps_per_task = 4
636
+ hint_penalty = 0.05
637
+ false_positive_penalty = 0.08
638
+ duplicate_penalty = 0.03
639
+ patch_bonus_multiplier = 0.2
640
+ max_history_entries = 50
641
+ } | ConvertTo-Json
642
+
643
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/config" `
644
+ -Method Put `
645
+ -Body $body `
646
+ -ContentType "application/json"
647
+ ```
648
+
649
+ ### Test 13: History
650
+
651
+ ```powershell
652
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/history" -Method Get
653
+ ```
654
+
655
+ Clear:
656
+
657
+ ```powershell
658
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/history" -Method Delete
659
+ ```
660
+
661
+ ## 6. How To Test Using The Inference Script
662
+
663
+ The inference script is for model-vs-environment evaluation.
664
+
665
+ ### Required Variables
666
+
667
+ ```powershell
668
+ $env:API_BASE_URL="https://api.openai.com/v1"
669
+ $env:MODEL_NAME="gpt-4.1-mini"
670
+ $env:OPENAI_API_KEY="your_key_here"
671
+ ```
672
+
673
+ If you want it to hit your local server instead of launching Docker:
674
+
675
+ ```powershell
676
+ $env:ENV_BASE_URL="http://127.0.0.1:8000"
677
+ ```
678
+
679
+ Optional:
680
+
681
+ ```powershell
682
+ $env:MAX_TASKS="3"
683
+ $env:MAX_STEPS="3"
684
+ $env:INFERENCE_REPORT_PATH="inference_results.json"
685
+ ```
686
+
687
+ Run:
688
+
689
+ ```powershell
690
+ python inference.py
691
+ ```
692
+
693
+ What it does:
694
+
695
+ 1. connects to the environment
696
+ 2. resets through up to 3 tasks
697
+ 3. sends task code and feedback to the model
698
+ 4. expects strict JSON findings back
699
+ 5. submits them through `step()`
700
+ 6. logs score and reward per step
701
+ 7. writes a final report JSON file
702
+
703
+ ### How To Interpret The Output
704
+
705
+ Focus on:
706
+
707
+ - `mean_score`
708
+ Overall average benchmark score
709
+
710
+ - per-task `score`
711
+ How well the model solved each task
712
+
713
+ - `passed`
714
+ Whether score met that task’s threshold
715
+
716
+ - step logs
717
+ Show whether the model is improving over trajectory or getting stuck
718
+
719
+ If the model keeps returning empty findings:
720
+
721
+ - improve the system prompt
722
+ - reduce task ambiguity
723
+ - add examples of desired findings
724
+ - ensure the model endpoint supports the chosen format well
725
+
726
+ ## 7. How To Build Better Training Data
727
+
728
+ If you want an agent trained in this RL environment to actually learn, the biggest bottleneck is data quality.
729
+
730
+ You need more than just three final benchmark tasks. You need trajectories, partial attempts, and failure examples.
731
+
732
+ ### Data Types You Should Collect
733
+
734
+ #### A. Gold Task Rubrics
735
+
736
+ For each task, store:
737
+
738
+ - code snippet
739
+ - hidden reference findings
740
+ - severity
741
+ - category
742
+ - expected line numbers
743
+ - good recommendations
744
+
745
+ This is already partially represented by `server/task_bank.py`.
746
+
747
+ #### B. Positive Demonstrations
748
+
749
+ Create solved examples where the review is high quality.
750
+
751
+ Each example should include:
752
+
753
+ - task code
754
+ - one or more strong findings
755
+ - strong rationales
756
+ - strong recommendations
757
+ - optional patch
758
+ - final score
759
+
760
+ This helps supervised warm-start and behavior cloning.
761
+
762
+ #### C. Partial Trajectories
763
+
764
+ This is important for RL.
765
+
766
+ Store intermediate attempts like:
767
+
768
+ - first attempt finds one issue
769
+ - second attempt adds another issue
770
+ - third attempt finalizes
771
+
772
+ This is what teaches agents to improve over time, not just emit one final perfect answer.
773
+
774
+ #### D. Negative Examples
775
+
776
+ You should also store:
777
+
778
+ - false positives
779
+ - irrelevant complaints
780
+ - duplicate findings
781
+ - hallucinated issues
782
+ - weak recommendations
783
+
784
+ Why:
785
+ - the reward function penalizes these
786
+ - the model must learn precision, not just recall
787
+
788
+ #### E. Hint Usage Examples
789
+
790
+ Store trajectories where:
791
+
792
+ - the agent requests a hint
793
+ - then improves its findings
794
+
795
+ This teaches policy behavior around when hints are worth the penalty.
796
+
797
+ #### F. Patch Examples
798
+
799
+ For tasks where patch quality matters, store:
800
+
801
+ - original code
802
+ - weak patch
803
+ - good patch
804
+ - patch score
805
+
806
+ This helps the model learn that code edits should remove actual problems, not just change formatting.
807
+
808
+ ## 8. Recommended Dataset Format
809
+
810
+ Use JSONL so it is easy to stream and train on.
811
+
812
+ ### Benchmark Task Record
813
+
814
+ ```json
815
+ {
816
+ "task_id": "py-review-easy",
817
+ "difficulty": "easy",
818
+ "code": "def load_settings(config_text):\n return eval(config_text)",
819
+ "reference_findings": [
820
+ {
821
+ "rule_id": "avoid-eval",
822
+ "line": 2,
823
+ "category": "security",
824
+ "severity": "critical"
825
+ }
826
+ ]
827
+ }
828
+ ```
829
+
830
+ ### Trajectory Record
831
+
832
+ ```json
833
+ {
834
+ "task_id": "py-review-medium",
835
+ "episode_id": "abc123",
836
+ "steps": [
837
+ {
838
+ "observation_feedback": "Review the Python snippet.",
839
+ "action": {
840
+ "operation": "submit_findings",
841
+ "findings": [
842
+ {
843
+ "title": "Mutable default argument leaks state",
844
+ "line": 1,
845
+ "category": "bug",
846
+ "severity": "warning"
847
+ }
848
+ ]
849
+ },
850
+ "reward": 0.35,
851
+ "score": 0.35
852
+ },
853
+ {
854
+ "observation_feedback": "Matched 1 new rubric item(s): mutable-default-list",
855
+ "action": {
856
+ "operation": "finalize",
857
+ "findings": [
858
+ {
859
+ "title": "Mutable default argument leaks state",
860
+ "line": 1,
861
+ "category": "bug",
862
+ "severity": "warning"
863
+ },
864
+ {
865
+ "title": "Bare except hides failures",
866
+ "line": 12,
867
+ "category": "maintainability",
868
+ "severity": "warning"
869
+ }
870
+ ]
871
+ },
872
+ "reward": 0.27,
873
+ "score": 0.62
874
+ }
875
+ ]
876
+ }
877
+ ```
878
+
879
+ ## 9. How To Make RL Learn Better
880
+
881
+ ### A. Add More Tasks
882
+
883
+ Three tasks are enough for the minimum requirement, but not enough for strong training.
884
+
885
+ You should expand with:
886
+
887
+ - file I/O bugs
888
+ - API misuse
889
+ - SQL injection
890
+ - unsafe deserialization
891
+ - concurrency issues
892
+ - caching mistakes
893
+ - resource leaks
894
+ - logic edge cases
895
+
896
+ Target:
897
+
898
+ - 50 to 200 deterministic tasks
899
+ - grouped by difficulty and domain
900
+
901
+ ### B. Add More Partial Reward Signals
902
+
903
+ Current reward is already better than binary success/fail, but you can improve it.
904
+
905
+ Possible additions:
906
+
907
+ - small bonus when the first critical issue is found early
908
+ - higher reward for critical issues than style issues
909
+ - bonus when rationale quality is high
910
+ - bonus when recommendation mentions a correct mitigation pattern
911
+ - penalty if line numbers are missing when they should be known
912
+
913
+ ### C. Improve Context In Observation
914
+
915
+ Right now the observation already gives:
916
+
917
+ - task metadata
918
+ - previous feedback
919
+ - submitted findings
920
+ - attempts remaining
921
+
922
+ You can improve learning further by including:
923
+
924
+ - a short list of matched findings so far
925
+ - a short list of remaining categories not yet covered
926
+ - normalized review rubric hints without leaking answers
927
+ - last action summary
928
+
929
+ This helps the agent reason about what it already did and what is still missing.
930
+
931
+ ### D. Separate Training Tasks From Benchmark Tasks
932
+
933
+ Important:
934
+
935
+ - training tasks should be large and varied
936
+ - benchmark tasks should stay hidden and fixed
937
+
938
+ Do not train directly on the same exact benchmark set you plan to judge on.
939
+
940
+ ### E. Add Preference Data
941
+
942
+ You can train preference models on:
943
+
944
+ - strong vs weak findings
945
+ - precise vs vague recommendations
946
+ - useful vs noisy patches
947
+
948
+ This is valuable for ranking quality beyond exact rubric matches.
949
+
950
+ ## 10. Functional Requirements Mapping
951
+
952
+ Here is how your environment should be judged against the stated requirements.
953
+
954
+ ### Requirement: Real-World Task Simulation
955
+
956
+ Status:
957
+ - satisfied in direction
958
+
959
+ Why:
960
+ - code review is a genuine engineering task
961
+
962
+ How to improve further:
963
+ - expand beyond tiny snippets into multi-function modules
964
+ - include operational and maintainability review, not just security lints
965
+
966
+ ### Requirement: OpenEnv Spec Compliance
967
+
968
+ Status:
969
+ - mostly implemented in code
970
+
971
+ Implemented pieces:
972
+ - typed action model
973
+ - typed observation model
974
+ - `reset()`
975
+ - `step()`
976
+ - `state`
977
+ - `openenv.yaml`
978
+ - FastAPI/OpenEnv routes
979
+
980
+ What you still need to verify:
981
+ - `openenv validate`
982
+ - schema compatibility under your installed OpenEnv version
983
+
984
+ ### Requirement: Minimum 3 Tasks With Agent Graders
985
+
986
+ Status:
987
+ - implemented
988
+
989
+ You have:
990
+ - easy
991
+ - medium
992
+ - hard
993
+ - deterministic grader returning `0.0` to `1.0`
994
+
995
+ ### Requirement: Meaningful Reward Function
996
+
997
+ Status:
998
+ - implemented
999
+
1000
+ Current reward signals:
1001
+ - new rubric matches
1002
+ - false positive penalties
1003
+ - duplicate penalties
1004
+ - hint penalties
1005
+ - patch bonus
1006
+ - finalize pass bonus
1007
+
1008
+ ### Requirement: Baseline Inference Script
1009
+
1010
+ Status:
1011
+ - implemented
1012
+
1013
+ Current `inference.py`:
1014
+ - uses OpenAI client
1015
+ - reads env vars
1016
+ - runs tasks
1017
+ - writes report
1018
+
1019
+ What to verify:
1020
+ - actual runtime under 20 minutes
1021
+ - reproducible output with your chosen model endpoint
1022
+
1023
+ ### Requirement: HF Spaces + Docker
1024
+
1025
+ Status:
1026
+ - code is prepared
1027
+
1028
+ You still need to verify:
1029
+
1030
+ - `docker build -f server/Dockerfile .`
1031
+ - local container startup
1032
+ - `openenv push`
1033
+ - `/health` returns 200 on the deployed Space
1034
+
1035
+ ## 11. Recommended Manual Validation Checklist
1036
+
1037
+ Before submission, run these in order:
1038
+
1039
+ 1. Start server locally
1040
+ 2. Hit `/health`
1041
+ 3. Hit `/docs`
1042
+ 4. Test `/tasks`
1043
+ 5. Test `/review` with unsafe examples
1044
+ 6. Test `/reset`
1045
+ 7. Test `/step` with partial findings
1046
+ 8. Test `/step` with finalize
1047
+ 9. Test `/tasks/{task_id}/grade`
1048
+ 10. Run `pytest`
1049
+ 11. Run `openenv validate`
1050
+ 12. Run `python inference.py`
1051
+ 13. Build Docker image
1052
+ 14. Deploy to Hugging Face Space
1053
+ 15. Re-test `/health` and `/reset` on the live Space
1054
+
1055
+ ## 12. Suggested Immediate Next Steps
1056
+
1057
+ If you want the environment to become stronger quickly, do this next:
1058
+
1059
+ 1. Add 10 to 20 more benchmark-style tasks in `server/task_bank.py`
1060
+ 2. Save solved and failed trajectories as JSONL files under a new `dataset/` directory
1061
+ 3. Refactor custom route state so `/history` and OpenEnv `/step` share one coherent session story
1062
+ 4. Run `openenv validate`
1063
+ 5. Run `inference.py` against your local server and inspect the report
1064
+
1065
+ ## 13. Quick Commands Summary
1066
+
1067
+ Start server:
1068
+
1069
+ ```powershell
1070
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
1071
+ ```
1072
+
1073
+ Open docs:
1074
+
1075
+ ```text
1076
+ http://127.0.0.1:8000/docs
1077
+ ```
1078
+
1079
+ Run example tests:
1080
+
1081
+ ```powershell
1082
+ python -m pytest tests -q
1083
+ ```
1084
+
1085
+ Run inference locally:
1086
+
1087
+ ```powershell
1088
+ $env:API_BASE_URL="https://api.openai.com/v1"
1089
+ $env:MODEL_NAME="gpt-4.1-mini"
1090
+ $env:OPENAI_API_KEY="your_key"
1091
+ $env:ENV_BASE_URL="http://127.0.0.1:8000"
1092
+ python inference.py
1093
+ ```
1094
+
1095
+ Validate OpenEnv:
1096
+
1097
+ ```powershell
1098
+ openenv validate
1099
+ ```
1100
+
1101
+ Build Docker:
1102
+
1103
+ ```powershell
1104
+ docker build -t python_env-env:latest -f server/Dockerfile .
1105
+ ```
1106
+
1107
+ Deploy:
1108
+
1109
+ ```powershell
1110
+ openenv push
1111
+ ```
README.md CHANGED
@@ -1,266 +1,272 @@
1
  ---
2
- title: Python Env Environment Server
3
- emoji: 🎶
4
- colorFrom: purple
5
- colorTo: red
6
  sdk: docker
7
- pinned: false
8
  app_port: 8000
9
  base_path: /web
 
10
  tags:
11
  - openenv
 
12
  ---
13
 
14
- # Python Env Environment
15
-
16
- A simple test environment that echoes back messages. Perfect for testing the env APIs as well as demonstrating environment usage patterns.
17
-
18
- ## Quick Start
19
-
20
- The simplest way to use the Python Env environment is through the `PythonEnv` class:
21
-
22
- ```python
23
- from python_env import PythonAction, PythonEnv
24
-
25
- try:
26
- # Create environment from Docker image
27
- python_envenv = PythonEnv.from_docker_image("python_env-env:latest")
28
-
29
- # Reset
30
- result = python_envenv.reset()
31
- print(f"Reset: {result.observation.echoed_message}")
32
-
33
- # Send multiple messages
34
- messages = ["Hello, World!", "Testing echo", "Final message"]
35
-
36
- for msg in messages:
37
- result = python_envenv.step(PythonAction(message=msg))
38
- print(f"Sent: '{msg}'")
39
- print(f" → Echoed: '{result.observation.echoed_message}'")
40
- print(f" → Length: {result.observation.message_length}")
41
- print(f" → Reward: {result.reward}")
42
-
43
- finally:
44
- # Always clean up
45
- python_envenv.close()
46
- ```
47
-
48
- That's it! The `PythonEnv.from_docker_image()` method handles:
49
- - Starting the Docker container
50
- - Waiting for the server to be ready
51
- - Connecting to the environment
52
- - Container cleanup when you call `close()`
53
-
54
- ## Building the Docker Image
55
-
56
- Before using the environment, you need to build the Docker image:
57
-
58
- ```bash
59
- # From project root
60
- docker build -t python_env-env:latest -f server/Dockerfile .
61
- ```
62
-
63
- ## Deploying to Hugging Face Spaces
64
-
65
- You can easily deploy your OpenEnv environment to Hugging Face Spaces using the `openenv push` command:
66
-
67
- ```bash
68
- # From the environment directory (where openenv.yaml is located)
69
- openenv push
70
-
71
- # Or specify options
72
- openenv push --namespace my-org --private
73
- ```
74
-
75
- The `openenv push` command will:
76
- 1. Validate that the directory is an OpenEnv environment (checks for `openenv.yaml`)
77
- 2. Prepare a custom build for Hugging Face Docker space (enables web interface)
78
- 3. Upload to Hugging Face (ensuring you're logged in)
79
-
80
- ### Prerequisites
81
-
82
- - Authenticate with Hugging Face: The command will prompt for login if not already authenticated
83
-
84
- ### Options
85
-
86
- - `--directory`, `-d`: Directory containing the OpenEnv environment (defaults to current directory)
87
- - `--repo-id`, `-r`: Repository ID in format 'username/repo-name' (defaults to 'username/env-name' from openenv.yaml)
88
- - `--base-image`, `-b`: Base Docker image to use (overrides Dockerfile FROM)
89
- - `--private`: Deploy the space as private (default: public)
90
-
91
- ### Examples
92
-
93
- ```bash
94
- # Push to your personal namespace (defaults to username/env-name from openenv.yaml)
95
- openenv push
96
-
97
- # Push to a specific repository
98
- openenv push --repo-id my-org/my-env
99
-
100
- # Push with a custom base image
101
- openenv push --base-image ghcr.io/meta-pytorch/openenv-base:latest
102
-
103
- # Push as a private space
104
- openenv push --private
105
-
106
- # Combine options
107
- openenv push --repo-id my-org/my-env --base-image custom-base:latest --private
108
- ```
109
-
110
- After deployment, your space will be available at:
111
- `https://huggingface.co/spaces/<repo-id>`
112
-
113
- The deployed space includes:
114
- - **Web Interface** at `/web` - Interactive UI for exploring the environment
115
- - **API Documentation** at `/docs` - Full OpenAPI/Swagger interface
116
- - **Health Check** at `/health` - Container health monitoring
117
- - **WebSocket** at `/ws` - Persistent session endpoint for low-latency interactions
118
-
119
- ## Environment Details
120
-
121
- ### Action
122
- **PythonAction**: Contains a single field
123
- - `message` (str) - The message to echo back
124
-
125
- ### Observation
126
- **PythonObservation**: Contains the echo response and metadata
127
- - `echoed_message` (str) - The message echoed back
128
- - `message_length` (int) - Length of the message
129
- - `reward` (float) - Reward based on message length (length × 0.1)
130
- - `done` (bool) - Always False for echo environment
131
- - `metadata` (dict) - Additional info like step count
132
-
133
- ### Reward
134
- The reward is calculated as: `message_length × 0.1`
135
- - "Hi" → reward: 0.2
136
- - "Hello, World!" → reward: 1.3
137
- - Empty message reward: 0.0
138
-
139
- ## Advanced Usage
140
-
141
- ### Connecting to an Existing Server
142
-
143
- If you already have a Python Env environment server running, you can connect directly:
144
-
145
- ```python
146
- from python_env import PythonEnv
147
-
148
- # Connect to existing server
149
- python_envenv = PythonEnv(base_url="<ENV_HTTP_URL_HERE>")
150
-
151
- # Use as normal
152
- result = python_envenv.reset()
153
- result = python_envenv.step(PythonAction(message="Hello!"))
154
- ```
155
-
156
- Note: When connecting to an existing server, `python_envenv.close()` will NOT stop the server.
157
-
158
- ### Using the Context Manager
159
-
160
- The client supports context manager usage for automatic connection management:
161
-
162
- ```python
163
- from python_env import PythonAction, PythonEnv
164
-
165
- # Connect with context manager (auto-connects and closes)
166
- with PythonEnv(base_url="http://localhost:8000") as env:
167
- result = env.reset()
168
- print(f"Reset: {result.observation.echoed_message}")
169
- # Multiple steps with low latency
170
- for msg in ["Hello", "World", "!"]:
171
- result = env.step(PythonAction(message=msg))
172
- print(f"Echoed: {result.observation.echoed_message}")
173
- ```
174
-
175
- The client uses WebSocket connections for:
176
- - **Lower latency**: No HTTP connection overhead per request
177
- - **Persistent session**: Server maintains your environment state
178
- - **Efficient for episodes**: Better for many sequential steps
179
-
180
- ### Concurrent WebSocket Sessions
181
-
182
- The server supports multiple concurrent WebSocket connections. To enable this,
183
- modify `server/app.py` to use factory mode:
184
-
185
- ```python
186
- # In server/app.py - use factory mode for concurrent sessions
187
- app = create_app(
188
- PythonEnvironment, # Pass class, not instance
189
- PythonAction,
190
- PythonObservation,
191
- max_concurrent_envs=4, # Allow 4 concurrent sessions
192
- )
193
- ```
194
-
195
- Then multiple clients can connect simultaneously:
196
-
197
- ```python
198
- from python_env import PythonAction, PythonEnv
199
- from concurrent.futures import ThreadPoolExecutor
200
-
201
- def run_episode(client_id: int):
202
- with PythonEnv(base_url="http://localhost:8000") as env:
203
- result = env.reset()
204
- for i in range(10):
205
- result = env.step(PythonAction(message=f"Client {client_id}, step {i}"))
206
- return client_id, result.observation.message_length
207
-
208
- # Run 4 episodes concurrently
209
- with ThreadPoolExecutor(max_workers=4) as executor:
210
- results = list(executor.map(run_episode, range(4)))
211
- ```
212
-
213
- ## Development & Testing
214
-
215
- ### Direct Environment Testing
216
-
217
- Test the environment logic directly without starting the HTTP server:
218
-
219
- ```bash
220
- # From the server directory
221
- python3 server/python_env_environment.py
222
- ```
223
-
224
- This verifies that:
225
- - Environment resets correctly
226
- - Step executes actions properly
227
- - State tracking works
228
- - Rewards are calculated correctly
229
-
230
- ### Running Locally
231
-
232
- Run the server locally for development:
233
-
234
- ```bash
235
- uvicorn server.app:app --reload
236
- ```
237
-
238
- ## Project Structure
239
-
240
- ```
241
- python_env/
242
- ├── .dockerignore # Docker build exclusions
243
- ├── __init__.py # Module exports
244
- ├── README.md # This file
245
- ├── openenv.yaml # OpenEnv manifest
246
- ├── pyproject.toml # Project metadata and dependencies
247
- ├── uv.lock # Locked dependencies (generated)
248
- ├── client.py # PythonEnv client
249
- ├── models.py # Action and Observation models
250
- └── server/
251
- ├── __init__.py # Server module exports
252
- ├── python_env_environment.py # Core environment logic
253
- ├── app.py # FastAPI application (HTTP + WebSocket endpoints)
254
- └── Dockerfile # Container image definition
255
- ```
256
- ---------------------------------------
257
-
258
- cd F:\python_env
259
- # Edit your environment implementation in server/python_env_environment.py
260
- # Edit your models in models.py
261
- # Install dependencies: uv sync
262
-
263
- # To integrate into OpenEnv repo:
264
- # 1. Copy this directory to <repo_root>/envs/python_env_env
265
- # 2. Build from repo root: docker build -t python_env_env:latest -f envs/python_env_env/server/Dockerfile .
266
- # 3. Run your image: docker run -p 8000:8000 python_env_env:latest
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Python Code Review Environment Server
 
 
 
3
  sdk: docker
 
4
  app_port: 8000
5
  base_path: /web
6
+ pinned: false
7
  tags:
8
  - openenv
9
+ - code-review
10
  ---
11
 
12
+ # Python Code Review Environment
13
+
14
+ A production-grade OpenEnv environment for Python code review, repair, and optimization tasks. This environment simulates real-world developer workflows where an AI agent reviews, fixes, and improves Python code.
15
+
16
+ ## Overview
17
+
18
+ **`python_code_review_env`** is a deterministic benchmark environment featuring:
19
+
20
+ - ✅ **3 real-world tasks** with increasing difficulty (Syntax, Bug Fix, Optimization)
21
+ - ✅ **Deterministic graders** using AST analysis, pytest execution, and performance benchmarking
22
+ - ✅ **OpenAI-compatible API** supporting free/open models (Gemini, DeepSeek, Together, OpenRouter)
23
+ - ✅ **Production-ready Docker** deployment for Hugging Face Spaces
24
+ - ✅ **Structured Observations & Actions** following OpenEnv spec
25
+ - ✅ **Rich reward shaping** with bonuses for syntax fixes, test passes, and optimization
26
+
27
+ ## Tasks
28
+
29
+ ### 1. 🟢 Easy: Syntax Fixing
30
+
31
+ **Task ID**: `syntax-fix-easy`
32
+
33
+ Fix broken Python code with syntax errors.
34
+
35
+ - **Difficulty**: Easy
36
+ - **Goal**: Repair syntax errors to make code compile
37
+ - **Starter Code**: Function with missing closing parenthesis
38
+ - **Grading**: Compilation check + code similarity to reference
39
+ - **Score Range**: 0.0–1.0
40
+
41
+ ### 2. 🟡 Medium: Bug Fixing
42
+
43
+ **Task ID**: `bug-fix-medium`
44
+
45
+ Fix logic bugs with visible and hidden test cases.
46
+
47
+ - **Difficulty**: Medium
48
+ - **Goal**: Repair a logic error in invoice calculation
49
+ - **Starter Code**: Function that returns wrong total (returns subtotal instead of discounted)
50
+ - **Grading**: Test pass fraction (visible & hidden)
51
+ - **Score Range**: 0.0–1.0
52
+
53
+ ### 3. 🔴 Hard: Optimization & Refactoring
54
+
55
+ **Task ID**: `optimization-hard`
56
+
57
+ Optimize inefficient code while maintaining correctness.
58
+
59
+ - **Difficulty**: Hard
60
+ - **Goal**: Convert O(n²) duplicate removal to O(n) with set
61
+ - **Starter Code**: Slow nested-loop implementation
62
+ - **Grading**: 50% correctness + 30% speedup + 15% code quality + 5% style
63
+ - **Score Range**: 0.0–1.0
64
+ - **Bonus**: Runtime benchmarking against reference implementation
65
+
66
+ ## Quick Start
67
+
68
+ ### Run Locally
69
+
70
+ ```bash
71
+ cd python-code-review-env
72
+ pip install -r server/requirements.txt
73
+ python -m server.app
74
+ ```
75
+
76
+ Visit http://localhost:8000/docs for the interactive API documentation.
77
+
78
+ ### Run with Docker
79
+
80
+ ```bash
81
+ docker build -f server/Dockerfile -t python_code_review_env:latest .
82
+ docker run -p 8000:8000 python_code_review_env:latest
83
+ ```
84
+
85
+ ### Run Inference
86
+
87
+ ```bash
88
+ python inference.py --model "gpt-3.5-turbo" --base-url "https://api.openai.com/v1"  # --base-url is the model API endpoint, not the environment URL
89
+ ```
90
+
91
+ ## OpenEnv Specification
92
+
93
+ ### Observation
94
+
95
+ ```json
96
+ {
97
+ "task_id": "syntax-fix-easy",
98
+ "difficulty": "easy",
99
+ "task_description": "Fix syntax errors...",
100
+ "current_code": "def normalize_username(raw_name: str) -> str:\n cleaned = raw_name.strip().lower(\n ...",
101
+ "errors": "invalid syntax ( line 2, column 40 )",
102
+ "test_results": "Not run yet.",
103
+ "visible_tests": ["normalize_username(' Alice Smith ') == 'alice_smith'"],
104
+ "history": [],
105
+ "attempts_remaining": 8,
106
+ "score": 0.0,
107
+ "reward": {
108
+ "value": 0.0,
109
+ "reason": "Episode reset."
110
+ }
111
+ }
112
+ ```
113
+
114
+ ### Action
115
+
116
+ ```json
117
+ {
118
+ "action_type": "edit_code",
119
+ "code": "def normalize_username(raw_name: str) -> str:\n cleaned = raw_name.strip().lower()\n if not cleaned:\n return \"anonymous\"\n return cleaned.replace(\" \", \"_\")"
120
+ }
121
+ ```
122
+
123
+ ### Reward Details
124
+
125
+ - **+0.35**: Syntax fixed (one-time per episode; `SYNTAX_FIX_BONUS`)
126
+ - **up to +0.30**: Test pass-rate improvement (`TEST_PASS_REWARD_SCALE`)
127
+ - **up to +0.15**: Code quality improvement (`QUALITY_BONUS_SCALE`)
128
+ - **+0.5**: Full correctness (100% hidden tests, one-time; `COMPLETION_BONUS`)
129
+ - **-0.15**: Invalid action (`INVALID_ACTION_PENALTY`)
130
+
131
+ ## Architecture
132
+
133
+ ```
134
+ python_code_review_env/
135
+ ├── models.py # Pydantic models (Observation, Action, Reward)
136
+ ├── server/
137
+ │ ├── app.py # FastAPI server
138
+ │ ├── env.py # OpenEnv environment
139
+ │ ├── Dockerfile # Docker config
140
+ │ └── requirements.txt
141
+ ├── graders/
142
+ │ ├── common.py # Shared utilities
143
+ │ ├── syntax.py # Syntax/bug graders
144
+ │ ├── optimization.py# Optimization grader
145
+ │ └── pytest_runner.py
146
+ ├── tasks/
147
+ │ ├── task_bank.py # 3 deterministic tasks
148
+ │ └── __init__.py
149
+ ├── inference.py # Baseline evaluation script
150
+ ├── openenv.yaml # OpenEnv spec
151
+ ├── pyproject.toml # Project metadata
152
+ └── README.md
153
+ ```
154
+
155
+ ## FastAPI Endpoints
156
+
157
+ - `GET /health` – Health check
158
+ - `GET /tasks` – List all tasks
159
+ - `GET /tasks/{task_id}` – Get task details
160
+ - `POST /tasks/{task_id}/grade` – Grade code offline
161
+ - Standard OpenEnv endpoints (`/reset`, `/step`, `/state`)
162
+
163
+ ## Deterministic Graders
164
+
165
+ ### Syntax Fix
166
+ ```
167
+ if code compiles:
168
+ score = 1.0
169
+ else:
170
+ score = 0.15 + 0.55 * similarity_to_reference
171
+ ```
172
+
173
+ ### Bug Fix
174
+ ```
175
+ score = test_pass_fraction (0.0 to 1.0)
176
+ ```
177
+
178
+ ### Optimization
179
+ ```
180
+ score = (
181
+ 0.5 * test_fraction +
182
+ 0.3 * speedup_score +
183
+ 0.15 * code_quality +
184
+ 0.05 * pep8_style
185
+ )
186
+ ```
187
+
188
+ ## Examples
189
+
190
+ ### Using Python
191
+
192
+ ```python
193
+ from server.env import PythonCodeReviewEnvironment
194
+ from models import PythonCodeReviewAction
195
+
196
+ env = PythonCodeReviewEnvironment()
197
+ obs = env.reset(task_id="syntax-fix-easy")
198
+
199
+ action = PythonCodeReviewAction(
200
+ action_type="edit_code",
201
+ code="""def normalize_username(raw_name: str) -> str:
202
+ cleaned = raw_name.strip().lower()
203
+ if not cleaned:
204
+ return "anonymous"
205
+ return cleaned.replace(" ", "_")
206
+ """
207
+ )
208
+
209
+ obs = env.step(action)
210
+ print(f"Score: {obs.score}")
211
+ print(f"Reward: {obs.reward.value:+.3f}")
212
+ ```
213
+
214
+ ### Using cURL
215
+
216
+ ```bash
217
+ # Check health
218
+ curl http://localhost:8000/health
219
+
220
+ # List tasks
221
+ curl http://localhost:8000/tasks
222
+
223
+ # Grade code
224
+ curl -X POST http://localhost:8000/tasks/syntax-fix-easy/grade \
225
+ -H "Content-Type: application/json" \
226
+ -d '{"action_type": "edit_code", "code": "..."}'
227
+ ```
228
+
229
+ ## Deployment
230
+
231
+ ### Hugging Face Spaces
232
+
233
+ 1. Create Space > Docker
234
+ 2. Upload files + `server/Dockerfile`
235
+ 3. Space auto-deploys on CPU
236
+ 4. Monitor `/health` endpoint
237
+
238
+ ### Local Docker
239
+
240
+ ```bash
241
+ docker build -f server/Dockerfile -t python_code_review_env .
242
+ docker run -p 8000:8000 \
243
+ -e MAX_CONCURRENT_ENVS=16 \
244
+ python_code_review_env
245
+ ```
246
+
247
+ ## Performance
248
+
249
+ - Startup: < 5s
250
+ - Reset: < 100ms
251
+ - Step: 50ms–3s (depends on action)
252
+ - Inference (3 tasks): < 20 minutes
253
+ - CPU: Works on 2 vCPU, 8GB RAM
254
+
255
+ ## Validation Checklist
256
+
257
+ - ✅ 3 deterministic tasks
258
+ - ✅ Deterministic graders (AST, pytest, benchmarks)
259
+ - ✅ `/health` returns 200
260
+ - ✅ Scores vary per task (not constant)
261
+ - ✅ Docker builds successfully
262
+ - ✅ OpenEnv spec compliant
263
+ - ✅ Reward shaping working
264
+ - ✅ All tests deterministic and reproducible
265
+
266
+ ## License
267
+
268
+ MIT
269
+
270
+ ---
271
+
272
+ **Built for production. Deterministic. Deployable. Extensible.**
REWARD_SYSTEM_GUIDE.md ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Reward System Implementation Guide
2
+
3
+ This document shows how the reward system is implemented in code and how to use it.
4
+
5
+ ## Module Documentation
6
+
7
+ The reward system architecture is documented at the module level:
8
+
9
+ ```python
10
+ import server.env
11
+ print(server.env.__doc__)
12
+ ```
13
+
14
+ Output shows all 6 reward components and the final computation formula.
15
+
16
+ ## Reward Constants
17
+
18
+ All reward constants are defined in `server/env.py` (lines 57-87):
19
+
20
+ ```python
21
+ # Component 1: Score improvement reward
22
+ PROGRESS_SCALE = 0.25
23
+
24
+ # Component 2: Syntax/compilation fix reward
25
+ SYNTAX_FIX_BONUS = 0.35
26
+
27
+ # Component 3: Test improvement reward
28
+ TEST_PASS_REWARD_SCALE = 0.30
29
+
30
+ # Component 4: Code quality reward
31
+ QUALITY_BONUS_SCALE = 0.15
32
+
33
+ # Component 5: Stagnation penalty
34
+ STAGNATION_PENALTY = 0.10
35
+
36
+ # Component 6: Regression penalty
37
+ REGRESSION_PENALTY_SCALE = 0.20
38
+
39
+ # One-time completion bonus
40
+ COMPLETION_BONUS = 0.50
41
+
42
+ # Invalid/error penalties
43
+ INVALID_ACTION_PENALTY = 0.15
44
+ TIMEOUT_PENALTY = 0.15
45
+ ```
46
+
47
+ To tune the reward system, edit these constants and re-test.
48
+
49
+ ## RewardDetails Model Documentation
50
+
51
+ Located in `models.py` (lines 26-80):
52
+
53
+ ```python
54
+ from models import RewardDetails
55
+ print(RewardDetails.__doc__)
56
+ ```
57
+
58
+ Shows all fields and their meanings; key fields include:
59
+ - `value`: Final scalar reward [-1.0, +1.0]
60
+ - `progress_delta`: Score improvement component
61
+ - `syntax_reward`: Syntax fix bonus
62
+ - `test_reward`: Test improvement bonus
63
+ - `quality_bonus`: Code quality improvement
64
+ - `stagnation_penalty`: Unchanged code penalty
65
+ - `regression_penalty`: Score decline penalty
66
+ - `reason`: Human-readable explanation
67
+ - `prev_score`, `curr_score`: Score before/after
68
+ - `code_changed`: Whether code was modified
69
+
70
+ ## Core Computation Method
71
+
72
+ The main reward computation is in `_compute_reward_components()` (server/env.py, lines 507-703):
73
+
74
+ ```python
75
+ def _compute_reward_components(
76
+ self,
77
+ curr_score: float,
78
+ prev_score: float,
79
+ curr_grade: TaskGrade,
80
+ code_changed: bool,
81
+ prev_grade_score: float = 0.0,
82
+ ) -> dict:
83
+ """Compute all six reward components and return combined result."""
84
+ ```
85
+
86
+ ### What It Does
87
+
88
+ 1. **Initializes** empty component dict
89
+ 2. **Computes each component**:
90
+ - Progress: Score improvement scaled by PROGRESS_SCALE
91
+ - Syntax: One-time bonus if first compile
92
+ - Test: Test pass rate improvement scaled by TEST_PASS_REWARD_SCALE
93
+ - Quality: Code quality improvement scaled by QUALITY_BONUS_SCALE
94
+ - Stagnation: Penalty if code unchanged
95
+ - Regression: Penalty if score decreased
96
+ 3. **Combines**: Sums positives, subtracts negatives
97
+ 4. **Clamps**: Bounds result to [-1.0, +1.0]
98
+
99
+ ### Key Design Decisions
100
+
101
+ - **Monotonic tracking**: Best test rate and quality in episode are tracked
102
+ - **One-time bonuses**: Syntax reward awarded once per episode
103
+ - **Scale capping**: Each component has a maximum (e.g., progress max +0.25)
104
+ - **Timeout handling**: Special penalty instead of score-based
105
+ - **Clamping**: Final reward bounded for numerical stability
106
+
107
+ ## Debug Logging
108
+
109
+ When `verbose=True`, the environment prints detailed debug output via `_log_debug_step()`:
110
+
111
+ ```python
112
+ env = PythonCodeReviewEnvironment(verbose=True)
113
+ obs = env.reset()
114
+ obs = env.step(action)
115
+ ```
116
+
117
+ Output format:
118
+ ```
119
+ Step 1 | Score: 0.698 | Delta: +0.698 | Reward: +0.4239 | Changed: False
120
+ | Progress=+0.174 | Quality=+0.149 | Stagnation=+0.100
121
+ | Reason: Syntax error detected: '(' was never closed
122
+ ```
123
+
124
+ Shows:
125
+ - Step number
126
+ - Current score and delta from previous
127
+ - Final reward value
128
+ - Whether code changed
129
+ - Non-zero components only
130
+ - Human-readable reason
131
+
132
+ ## Example: Full Episode with Rewards
133
+
134
+ ```python
135
+ from server.env import PythonCodeReviewEnvironment
136
+ from models import PythonCodeReviewAction
137
+
138
+ env = PythonCodeReviewEnvironment(verbose=True)
139
+ obs = env.reset(task_id='syntax-fix-easy')
140
+
141
+ # Step 1: Analyze (no code change)
142
+ action = PythonCodeReviewAction(action_type='analyze_code')
143
+ obs = env.step(action)
144
+ print(f"Reward 1: {obs.reward_details.value:.4f}")
145
+
146
+ # Step 2: Edit with fix
147
+ code = 'x = 1; y = 2; print(x + y)'
148
+ action = PythonCodeReviewAction(action_type='edit_code', code=code)
149
+ obs = env.step(action)
150
+ print(f"Reward 2: {obs.reward_details.value:.4f}")
151
+
152
+ # Step 3: Submit
153
+ action = PythonCodeReviewAction(action_type='submit_solution')
154
+ obs = env.step(action)
155
+ print(f"Final Reward: {obs.reward_details.value:.4f}")
156
+ ```
157
+
158
+ ## Interpreting Rewards
159
+
160
+ ### Positive Rewards (+0 to +1.0)
161
+ - **+0.5 to +1.0**: Major progress (syntax fix, many tests passing)
162
+ - **+0.2 to +0.5**: Good progress (score improvement, test gains)
163
+ - **+0.0 to +0.2**: Small progress (quality improvement, minor gains)
164
+
165
+ ### Negative Rewards (−1.0 to 0)
166
+ - **−0.1 to 0**: Stagnation (analyzed without changing code)
167
+ - **−0.2 to −0.1**: Slight regression (small score drop)
168
+ - **−0.5 to −0.2**: Major regression (significant score drop)
169
+ - **−1.0 to −0.5**: Invalid action or timeout
170
+
171
+ ## Tuning the Reward System
172
+
173
+ ### For Faster Early Learning
174
+ ↑ Increase `SYNTAX_FIX_BONUS` and `COMPLETION_BONUS`
175
+
176
+ ### To Encourage Editing Over Analysis
177
+ ↑ Increase `STAGNATION_PENALTY`
178
+
179
+ ### To Reward Test Improvements More
180
+ ↑ Increase `TEST_PASS_REWARD_SCALE`
181
+
182
+ ### To Penalize Mistakes More
183
+ ↑ Increase `REGRESSION_PENALTY_SCALE`
184
+
185
+ ### To Balance All Components
186
+ Adjust the Scale constants (all in range 0.15-0.35 for stability)
187
+
188
+ ## Accessing Documentation Programmatically
189
+
190
+ ```python
191
+ from server.env import PythonCodeReviewEnvironment
192
+ from models import RewardDetails
193
+ import server.env
194
+
195
+ # Module-level architecture
196
+ print(server.env.__doc__)
197
+
198
+ # RewardDetails fields
199
+ print(RewardDetails.__doc__)
200
+
201
+ # Help text for a single method
202
+ env = PythonCodeReviewEnvironment()
203
+ help(env._compute_reward_components)
204
+ ```
205
+
206
+ All major functions and classes have comprehensive docstrings.
__init__.py CHANGED
@@ -1,16 +1,40 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
 
7
- """Python Env Environment."""
8
-
9
- from .client import PythonEnv
10
- from .models import PythonAction, PythonObservation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  __all__ = [
13
- "PythonAction",
14
- "PythonObservation",
15
  "PythonEnv",
 
 
 
 
 
 
 
 
 
 
16
  ]
 
1
+ """Public package API for the Python code review OpenEnv benchmark."""
 
 
 
 
2
 
3
+ try:
4
+ from .client import CodeReviewEnv, MyEnv, PythonEnv
5
+ from .models import (
6
+ HealthResponse,
7
+ HistoryEntry,
8
+ PythonCodeReviewAction,
9
+ PythonCodeReviewObservation,
10
+ PythonCodeReviewState,
11
+ RewardDetails,
12
+ TaskDescriptor,
13
+ TaskGrade,
14
+ )
15
+ except ImportError: # pragma: no cover
16
+ from client import CodeReviewEnv, MyEnv, PythonEnv
17
+ from models import (
18
+ HealthResponse,
19
+ HistoryEntry,
20
+ PythonCodeReviewAction,
21
+ PythonCodeReviewObservation,
22
+ PythonCodeReviewState,
23
+ RewardDetails,
24
+ TaskDescriptor,
25
+ TaskGrade,
26
+ )
27
 
28
  __all__ = [
 
 
29
  "PythonEnv",
30
+ "CodeReviewEnv",
31
+ "MyEnv",
32
+ "PythonCodeReviewAction",
33
+ "PythonCodeReviewObservation",
34
+ "PythonCodeReviewState",
35
+ "HealthResponse",
36
+ "HistoryEntry",
37
+ "RewardDetails",
38
+ "TaskDescriptor",
39
+ "TaskGrade",
40
  ]
client.py CHANGED
@@ -1,46 +1,75 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
 
7
- """Python Env Environment Client."""
8
 
9
- from typing import Any, Dict
 
 
10
 
11
  from openenv.core import EnvClient
12
  from openenv.core.client_types import StepResult
13
- from openenv.core.env_server.types import State
14
-
15
- try:
16
- from .models import PythonAction, PythonObservation
17
- except ImportError:
18
- from models import PythonAction, PythonObservation # type: ignore
19
-
20
-
21
- class PythonEnv(EnvClient[PythonAction, PythonObservation, State]):
22
- """Typed client for the Python code-review environment."""
23
-
24
- def _step_payload(self, action: PythonAction) -> Dict[str, Any]:
25
- """Convert a validated action model to the JSON payload expected by the server."""
26
-
27
- return action.model_dump(exclude_none=True)
28
-
29
- def _parse_result(self, payload: Dict[str, Any]) -> StepResult[PythonObservation]:
30
- """Parse a server response into a typed step result."""
31
-
32
- obs_data = dict(payload.get("observation", {}))
33
- obs_data.setdefault("done", payload.get("done", False))
34
- obs_data.setdefault("reward", payload.get("reward"))
35
- observation = PythonObservation.model_validate(obs_data)
36
-
37
- return StepResult(
38
- observation=observation,
39
- reward=payload.get("reward"),
40
- done=payload.get("done", False),
41
- )
42
-
43
- def _parse_state(self, payload: Dict[str, Any]) -> State:
44
- """Parse the server state payload into the shared state model."""
45
-
46
- return State.model_validate(payload)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Client for the Python code review environment."""
2
+
3
+ from __future__ import annotations
 
 
4
 
5
+ from typing import Dict
6
 
7
+ from compat import install_openenv_fastmcp_compat
8
+
9
+ install_openenv_fastmcp_compat()
10
 
11
  from openenv.core import EnvClient
12
  from openenv.core.client_types import StepResult
13
+
14
+ from models import (
15
+ HistoryEntry,
16
+ PythonCodeReviewAction,
17
+ PythonCodeReviewObservation,
18
+ PythonCodeReviewState,
19
+ RewardDetails,
20
+ )
21
+
22
+
23
+ class PythonEnv(
24
+ EnvClient[PythonCodeReviewAction, PythonCodeReviewObservation, PythonCodeReviewState]
25
+ ):
26
+ """OpenEnv HTTP client for the Python code review benchmark."""
27
+
28
+ def _step_payload(self, action: PythonCodeReviewAction) -> Dict:
29
+ return action.model_dump(exclude_none=True)
30
+
31
+ def _parse_result(self, payload: Dict) -> StepResult[PythonCodeReviewObservation]:
32
+ obs = payload.get("observation", {})
33
+ observation = PythonCodeReviewObservation(
34
+ task_id=obs["task_id"],
35
+ title=obs["title"],
36
+ difficulty=obs["difficulty"],
37
+ task_kind=obs["task_kind"],
38
+ task_description=obs["task_description"],
39
+ current_code=obs.get("current_code", ""),
40
+ errors=obs.get("errors", ""),
41
+ test_results=obs.get("test_results", ""),
42
+ history=[HistoryEntry(**entry) for entry in obs.get("history", [])],
43
+ attempts_remaining=obs.get("attempts_remaining", 0),
44
+ last_action_status=obs.get("last_action_status", ""),
45
+ score=obs.get("score", 0.0),
46
+ reward_details=RewardDetails(**obs.get("reward_details", {})),
47
+ done=payload.get("done", obs.get("done", False)),
48
+ reward=payload.get("reward", obs.get("reward")),
49
+ metadata=obs.get("metadata", {}),
50
+ )
51
+ return StepResult(
52
+ observation=observation,
53
+ reward=payload.get("reward", obs.get("reward")),
54
+ done=payload.get("done", obs.get("done", False)),
55
+ )
56
+
57
+ def _parse_state(self, payload: Dict) -> PythonCodeReviewState:
58
+ return PythonCodeReviewState(
59
+ episode_id=payload.get("episode_id"),
60
+ step_count=payload.get("step_count", 0),
61
+ task_id=payload.get("task_id"),
62
+ difficulty=payload.get("difficulty"),
63
+ task_kind=payload.get("task_kind"),
64
+ attempts_remaining=payload.get("attempts_remaining", 0),
65
+ current_code=payload.get("current_code", ""),
66
+ errors=payload.get("errors", ""),
67
+ test_results=payload.get("test_results", ""),
68
+ history=[HistoryEntry(**entry) for entry in payload.get("history", [])],
69
+ score=payload.get("score", 0.0),
70
+ done=payload.get("done", False),
71
+ )
72
+
73
+
74
+ CodeReviewEnv = PythonEnv
75
+ MyEnv = PythonEnv
compat.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Compatibility helpers for OpenEnv and FastMCP runtime drift."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ import types
7
+ from typing import Any, Optional
8
+
9
+
10
+ def install_openenv_fastmcp_compat() -> None:
11
+ """Patch FastMCP API differences so older OpenEnv builds keep importing."""
12
+ try:
13
+ import fastmcp # type: ignore
14
+ except Exception:
15
+ return
16
+
17
+ try:
18
+ if not hasattr(fastmcp, "Client"):
19
+ class CompatClient:
20
+ """Minimal async MCP client used for legacy OpenEnv imports."""
21
+
22
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
23
+ self.args = args
24
+ self.kwargs = kwargs
25
+
26
+ async def __aenter__(self) -> "CompatClient":
27
+ return self
28
+
29
+ async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool:
30
+ return False
31
+
32
+ async def list_tools(self) -> list[Any]:
33
+ return []
34
+
35
+ async def call_tool(self, tool_name: str, arguments: dict[str, Any]) -> Any:
36
+ raise RuntimeError(
37
+ f"MCP client compatibility mode cannot call tool: {tool_name}"
38
+ )
39
+
40
+ fastmcp.Client = CompatClient # type: ignore[attr-defined]
41
+ except Exception:
42
+ pass
43
+
44
+ try:
45
+ client_pkg = sys.modules.get("fastmcp.client")
46
+ if client_pkg is None:
47
+ client_pkg = types.ModuleType("fastmcp.client")
48
+ sys.modules["fastmcp.client"] = client_pkg
49
+
50
+ client_mod = sys.modules.get("fastmcp.client.client")
51
+ if client_mod is None:
52
+ client_mod = types.ModuleType("fastmcp.client.client")
53
+ sys.modules["fastmcp.client.client"] = client_mod
54
+
55
+ if not hasattr(client_mod, "CallToolResult"):
56
+ class CallToolResult:
57
+ """Compatibility container for legacy OpenEnv response handling."""
58
+
59
+ def __init__(
60
+ self,
61
+ content: Any = None,
62
+ structured_content: Any = None,
63
+ meta: Any = None,
64
+ data: Any = None,
65
+ is_error: bool = False,
66
+ ) -> None:
67
+ self.content = content
68
+ self.structured_content = structured_content
69
+ self.meta = meta
70
+ self.data = data
71
+ self.is_error = is_error
72
+
73
+ client_mod.CallToolResult = CallToolResult
74
+
75
+ client_pkg.client = client_mod # type: ignore[attr-defined]
76
+ except Exception:
77
+ pass
78
+
79
+
80
+ install_openenv_fastmcp_compat()
81
+
82
+
83
+ try:
84
+ from openenv.core.env_server.http_server import create_app as openenv_create_app
85
+ from openenv.core.env_server.interfaces import Environment
86
+ from openenv.core.env_server.types import Action, Observation, State
87
+ except Exception as exc: # pragma: no cover
88
+ raise RuntimeError(f"OpenEnv runtime import failed after compatibility patch: {exc}") from exc
89
+
90
+
91
+ create_app = openenv_create_app
92
+
examples/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Example snippets for the Python review environment."""
examples/python_review_examples.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Example Python snippets for exercising the review environment."""
2
+
3
+ EXAMPLE_SNIPPETS = {
4
+ "unsafe_eval": "\n".join(
5
+ [
6
+ "def load_settings(config_text):",
7
+ " return eval(config_text)",
8
+ ]
9
+ ),
10
+ "mutable_default": "\n".join(
11
+ [
12
+ "def append_name(name, names=[]):",
13
+ " names.append(name)",
14
+ " return names",
15
+ ]
16
+ ),
17
+ "bare_except": "\n".join(
18
+ [
19
+ "def publish_report(report):",
20
+ " try:",
21
+ ' return report[\"summary\"]',
22
+ " except:",
23
+ " return None",
24
+ ]
25
+ ),
26
+ "shell_injection": "\n".join(
27
+ [
28
+ "import subprocess",
29
+ "",
30
+ "def run_script(script_path, user_input):",
31
+ ' cmd = f\"python {script_path} {user_input}\"',
32
+ " return subprocess.check_output(cmd, shell=True, text=True)",
33
+ ]
34
+ ),
35
+ "syntax_error": "\n".join(
36
+ [
37
+ "def broken_function(",
38
+ " return 42",
39
+ ]
40
+ ),
41
+ "clean_function": "\n".join(
42
+ [
43
+ "def normalize_name(name: str) -> str:",
44
+ " cleaned = name.strip().lower()",
45
+ " return cleaned.replace(\"  \", \" \")",
46
+ ]
47
+ ),
48
+ }
49
+
50
+
51
+ EXPECTED_RULE_IDS = {
52
+ "unsafe_eval": {"avoid-eval"},
53
+ "mutable_default": {"mutable-default-list"},
54
+ "bare_except": {"bare-except"},
55
+ "shell_injection": {"shell-true-command-injection"},
56
+ "syntax_error": {"syntax-error"},
57
+ "clean_function": set(),
58
+ }
graders/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic graders for the Python code review environment."""
2
+
3
+ from .common import clamp_score
4
+ from .optimization import grade_optimization_task
5
+ from .pytest_runner import PytestExecution, run_pytest_suite
6
+ from .syntax import grade_bug_fix_task, grade_syntax_task, grade_task
7
+
8
+ __all__ = [
9
+ "PytestExecution",
10
+ "clamp_score",
11
+ "grade_bug_fix_task",
12
+ "grade_optimization_task",
13
+ "grade_syntax_task",
14
+ "grade_task",
15
+ "run_pytest_suite",
16
+ ]
graders/common.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared deterministic scoring helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ast
6
+ import difflib
7
+ import traceback
8
+ from typing import Tuple
9
+
10
+
11
+ def clamp_score(value: float) -> float:
12
+ """Clamp any scalar score into the required 0..1 interval."""
13
+
14
+ return max(0.0, min(1.0, round(value, 6)))
15
+
16
+
17
+ def syntax_error_message(code: str) -> str:
18
+ """Return a concise syntax error string or an empty string."""
19
+
20
+ try:
21
+ ast.parse(code)
22
+ except SyntaxError as exc:
23
+ return f"{exc.msg} (line {exc.lineno}, column {exc.offset})"
24
+ except Exception: # pragma: no cover
25
+ return traceback.format_exc(limit=1).strip()
26
+ return ""
27
+
28
+
29
+ def compiles(code: str) -> bool:
30
+ """Return whether the code parses and compiles."""
31
+
32
+ try:
33
+ compile(code, "<candidate>", "exec")
34
+ except Exception:
35
+ return False
36
+ return True
37
+
38
+
39
+ def normalized_diff_score(code: str, reference_code: str) -> float:
40
+ """Score textual similarity to the reference solution."""
41
+
42
+ ratio = difflib.SequenceMatcher(
43
+ a="".join(code.split()),
44
+ b="".join(reference_code.split()),
45
+ ).ratio()
46
+ return clamp_score(ratio)
47
+
48
+
49
+ def style_score(code: str, max_line_length: int = 88) -> float:
50
+ """Simple deterministic PEP8-inspired style score."""
51
+
52
+ lines = code.splitlines() or [""]
53
+ line_length_ok = sum(1 for line in lines if len(line) <= max_line_length) / len(lines)
54
+ tab_ok = 1.0 if all("\t" not in line for line in lines) else 0.0
55
+ trailing_ws_ok = 1.0 if all(line == line.rstrip() for line in lines) else 0.0
56
+ return clamp_score((line_length_ok * 0.6) + (tab_ok * 0.2) + (trailing_ws_ok * 0.2))
57
+
58
+
59
+ def nested_loop_depth(tree: ast.AST) -> int:
60
+ """Return the maximum nested loop depth in the AST."""
61
+
62
+ best = 0
63
+
64
+ def walk(node: ast.AST, depth: int) -> None:
65
+ nonlocal best
66
+ if isinstance(node, (ast.For, ast.AsyncFor, ast.While)):
67
+ depth += 1
68
+ best = max(best, depth)
69
+ for child in ast.iter_child_nodes(node):
70
+ walk(child, depth)
71
+
72
+ walk(tree, 0)
73
+ return best
74
+
75
+
76
+ def compile_tree(code: str) -> Tuple[ast.AST | None, str]:
77
+ """Return AST tree and optional parse error."""
78
+
79
+ try:
80
+ return ast.parse(code), ""
81
+ except SyntaxError as exc:
82
+ return None, f"{exc.msg} (line {exc.lineno}, column {exc.offset})"
graders/optimization.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic grading for optimization and refactor tasks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import subprocess
7
+ import sys
8
+ import tempfile
9
+ from pathlib import Path
10
+
11
+ from graders.common import clamp_score, compile_tree, nested_loop_depth, style_score
12
+ from graders.pytest_runner import run_pytest_suite
13
+ from models import TaskGrade
14
+ from tasks.task_bank import TaskSpec
15
+
16
+
17
+ def _benchmark_script(task: TaskSpec) -> str:
18
+ return f"""import json
19
+ import time
20
+ from candidate import {task.benchmark_entrypoint}
21
+
22
+ {task.benchmark_builder}
23
+
24
+ events = build_benchmark_events()
25
+ start = time.perf_counter()
26
+ for _ in range({task.benchmark_repeats}):
27
+ result = {task.benchmark_entrypoint}(events)
28
+ elapsed = time.perf_counter() - start
29
+ Path = __import__("pathlib").Path
30
+ Path("benchmark.json").write_text(json.dumps({{"elapsed": elapsed, "rows": len(result)}}), encoding="utf-8")
31
+ """
32
+
33
+
34
+ def benchmark_runtime(candidate_code: str, task: TaskSpec) -> tuple[float, bool, str]:
35
+ """Benchmark runtime deterministically against the starter implementation."""
36
+
37
+ assert task.benchmark_entrypoint is not None
38
+ try:
39
+ with tempfile.TemporaryDirectory(prefix="python-code-review-bench-") as temp_dir:
40
+ temp_path = Path(temp_dir)
41
+ (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
42
+ (temp_path / "starter.py").write_text(task.starter_code, encoding="utf-8")
43
+ (temp_path / "candidate_runner.py").write_text(_benchmark_script(task), encoding="utf-8")
44
+
45
+ starter_script = _benchmark_script(task).replace("from candidate import", "from starter import")
46
+ (temp_path / "starter_runner.py").write_text(starter_script, encoding="utf-8")
47
+
48
+ try:
49
+ starter_run = subprocess.run(
50
+ [sys.executable, "starter_runner.py"],
51
+ cwd=temp_path,
52
+ capture_output=True,
53
+ text=True,
54
+ timeout=task.benchmark_timeout_s,
55
+ check=False,
56
+ )
57
+ starter_payload = json.loads((temp_path / "benchmark.json").read_text(encoding="utf-8"))
58
+
59
+ candidate_run = subprocess.run(
60
+ [sys.executable, "candidate_runner.py"],
61
+ cwd=temp_path,
62
+ capture_output=True,
63
+ text=True,
64
+ timeout=task.benchmark_timeout_s,
65
+ check=False,
66
+ )
67
+ candidate_payload = json.loads((temp_path / "benchmark.json").read_text(encoding="utf-8"))
68
+ except subprocess.TimeoutExpired as exc:
69
+ output = (exc.stdout or "") + (exc.stderr or "")
70
+ return 0.0, True, (output or "benchmark timed out").strip()
71
+ except Exception as exc: # pragma: no cover
72
+ return 0.0, False, str(exc)
73
+
74
+ starter_elapsed = max(float(starter_payload["elapsed"]), 1e-9)
75
+ candidate_elapsed = max(float(candidate_payload["elapsed"]), 1e-9)
76
+ speedup = starter_elapsed / candidate_elapsed
77
+ runtime_score = clamp_score(min((speedup - 1.0) / 3.0, 1.0))
78
+ output = "\n".join(
79
+ part
80
+ for part in [
81
+ starter_run.stdout.strip(),
82
+ starter_run.stderr.strip(),
83
+ candidate_run.stdout.strip(),
84
+ candidate_run.stderr.strip(),
85
+ f"starter={starter_elapsed:.6f}s candidate={candidate_elapsed:.6f}s speedup={speedup:.2f}x",
86
+ ]
87
+ if part
88
+ )
89
+ return runtime_score, False, output
90
+ except Exception as exc: # pragma: no cover
91
+ return 0.0, False, str(exc)
92
+
93
+
94
+ def ast_quality_score(code: str, task: TaskSpec) -> float:
95
+ """Score maintainability and algorithmic structure."""
96
+
97
+ tree, _ = compile_tree(code)
98
+ if tree is None:
99
+ return 0.0
100
+
101
+ import ast
102
+
103
+ function_node = next(
104
+ (node for node in tree.body if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))),
105
+ None,
106
+ )
107
+ docstring_points = 0.2 if function_node and ast.get_docstring(function_node, clean=False) else 0.0
108
+ nested_points = 0.4 if nested_loop_depth(tree) <= 1 else 0.0
109
+ marker_points = 0.0
110
+ for marker in task.expected_quality_markers:
111
+ if marker in code:
112
+ marker_points += 0.2
113
+ return clamp_score(docstring_points + nested_points + marker_points)
114
+
115
+
116
+ def grade_optimization_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
117
+ """Grade optimization tasks using correctness, runtime, AST quality, and style."""
118
+
119
+ execution = run_pytest_suite(
120
+ candidate_code,
121
+ [*task.visible_tests, *task.hidden_tests],
122
+ timeout_s=task.benchmark_timeout_s,
123
+ )
124
+ test_fraction = execution.passed / execution.total if execution.total else 0.0
125
+
126
+ if execution.timed_out:
127
+ return TaskGrade(
128
+ score=0.0,
129
+ tests_passed=execution.passed,
130
+ tests_total=execution.total,
131
+ timed_out=True,
132
+ details={"tests": execution.output},
133
+ )
134
+
135
+ runtime_score, timed_out, benchmark_output = benchmark_runtime(candidate_code, task)
136
+ if timed_out:
137
+ return TaskGrade(
138
+ score=0.0,
139
+ tests_passed=execution.passed,
140
+ tests_total=execution.total,
141
+ timed_out=True,
142
+ details={"tests": execution.output, "benchmark": benchmark_output},
143
+ )
144
+
145
+ quality_score = ast_quality_score(candidate_code, task)
146
+ pep8_score = style_score(candidate_code, task.style_max_line_length)
147
+ score = clamp_score(
148
+ (0.5 * test_fraction)
149
+ + (0.3 * runtime_score)
150
+ + (0.15 * quality_score)
151
+ + (0.05 * pep8_score)
152
+ )
153
+ return TaskGrade(
154
+ score=score,
155
+ syntax_score=1.0,
156
+ tests_passed=execution.passed,
157
+ tests_total=execution.total,
158
+ quality_score=quality_score,
159
+ runtime_score=runtime_score,
160
+ details={
161
+ "tests": execution.output,
162
+ "benchmark": benchmark_output,
163
+ "test_fraction": round(test_fraction, 4),
164
+ "runtime_score": round(runtime_score, 4),
165
+ "style_score": round(pep8_score, 4),
166
+ },
167
+ )
graders/pytest_runner.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Helpers for deterministic pytest execution in temp sandboxes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import subprocess
7
+ import sys
8
+ import tempfile
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import Iterable
12
+
13
+
14
+ @dataclass(frozen=True)
15
+ class PytestExecution:
16
+ """Exact pytest execution summary."""
17
+
18
+ passed: int
19
+ failed: int
20
+ total: int
21
+ timed_out: bool
22
+ output: str
23
+
24
+
25
+ def _test_module_source(tests: Iterable[str]) -> str:
26
+ """Build a valid pytest module from expression-style or full test snippets."""
27
+ blocks: list[str] = ["from candidate import * # noqa: F401,F403"]
28
+ for index, test in enumerate(tests, start=1):
29
+ snippet = str(test).strip()
30
+ if not snippet:
31
+ continue
32
+ if snippet.startswith("def test_"):
33
+ blocks.append(snippet)
34
+ continue
35
+ blocks.append(
36
+ "\n".join(
37
+ [
38
+ f"def test_case_{index:03d}():",
39
+ f" assert {snippet}",
40
+ ]
41
+ )
42
+ )
43
+ return "\n\n".join(blocks) or "def test_placeholder():\n assert True\n"
44
+
45
+
46
+ def _runner_script() -> str:
47
+ return """import json
48
+ import pathlib
49
+ import pytest
50
+
51
+
52
+ class Collector:
53
+ def __init__(self) -> None:
54
+ self.passed = 0
55
+ self.failed = 0
56
+
57
+ def pytest_runtest_logreport(self, report):
58
+ if report.when != "call":
59
+ return
60
+ if report.passed:
61
+ self.passed += 1
62
+ elif report.failed:
63
+ self.failed += 1
64
+
65
+
66
+ collector = Collector()
67
+ exit_code = pytest.main(["-q", "test_candidate.py"], plugins=[collector])
68
+ payload = {
69
+ "passed": collector.passed,
70
+ "failed": collector.failed,
71
+ "exit_code": int(exit_code),
72
+ }
73
+ pathlib.Path("pytest_results.json").write_text(json.dumps(payload), encoding="utf-8")
74
+ """
75
+
76
+
77
+ def run_pytest_suite(candidate_code: str, tests: Iterable[str], timeout_s: float = 3.0) -> PytestExecution:
78
+ """Run a pytest suite against candidate.py and return structured results."""
79
+
80
+ test_cases = list(tests)
81
+ try:
82
+ with tempfile.TemporaryDirectory(prefix="python-code-review-") as temp_dir:
83
+ temp_path = Path(temp_dir)
84
+ (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
85
+ (temp_path / "test_candidate.py").write_text(_test_module_source(test_cases), encoding="utf-8")
86
+ (temp_path / "runner.py").write_text(_runner_script(), encoding="utf-8")
87
+
88
+ try:
89
+ completed = subprocess.run(
90
+ [sys.executable, "runner.py"],
91
+ cwd=temp_path,
92
+ capture_output=True,
93
+ text=True,
94
+ timeout=timeout_s,
95
+ check=False,
96
+ )
97
+ except subprocess.TimeoutExpired as exc:
98
+ output = (exc.stdout or "") + (exc.stderr or "")
99
+ return PytestExecution(
100
+ passed=0,
101
+ failed=max(len(test_cases), 1),
102
+ total=max(len(test_cases), 1),
103
+ timed_out=True,
104
+ output=(output or "pytest timed out").strip(),
105
+ )
106
+
107
+ result_path = temp_path / "pytest_results.json"
108
+ if not result_path.exists():
109
+ output = (completed.stdout or "") + (completed.stderr or "")
110
+ total = max(len(test_cases), 1)
111
+ return PytestExecution(
112
+ passed=0,
113
+ failed=total,
114
+ total=total,
115
+ timed_out=False,
116
+ output=output.strip(),
117
+ )
118
+
119
+ try:
120
+ payload = json.loads(result_path.read_text(encoding="utf-8"))
121
+ except Exception as exc:
122
+ output = ((completed.stdout or "") + (completed.stderr or "")).strip()
123
+ return PytestExecution(
124
+ passed=0,
125
+ failed=max(len(test_cases), 1),
126
+ total=max(len(test_cases), 1),
127
+ timed_out=False,
128
+ output=(output or str(exc)).strip(),
129
+ )
130
+
131
+ passed = int(payload.get("passed", 0))
132
+ failed = int(payload.get("failed", 0))
133
+ total = max(passed + failed, len(test_cases))
134
+ output = ((completed.stdout or "") + (completed.stderr or "")).strip()
135
+ return PytestExecution(
136
+ passed=passed,
137
+ failed=failed,
138
+ total=total,
139
+ timed_out=False,
140
+ output=output,
141
+ )
142
+ except Exception as exc:
143
+ return PytestExecution(
144
+ passed=0,
145
+ failed=max(len(test_cases), 1),
146
+ total=max(len(test_cases), 1),
147
+ timed_out=False,
148
+ output=str(exc),
149
+ )
graders/syntax.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Task graders for syntax and bug-fix tasks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from graders.common import clamp_score, compiles, normalized_diff_score, style_score, syntax_error_message
6
+ from graders.optimization import grade_optimization_task
7
+ from graders.pytest_runner import run_pytest_suite
8
+ from models import TaskGrade
9
+ from tasks.task_bank import TaskSpec
10
+
11
+
12
+ def grade_syntax_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
13
+ """Grade syntax repair tasks with partial credit for progress toward the reference."""
14
+
15
+ error = syntax_error_message(candidate_code)
16
+ diff_score = normalized_diff_score(candidate_code, task.reference_code)
17
+ style_base = style_score(candidate_code, task.style_max_line_length)
18
+
19
+ if not error:
20
+ return TaskGrade(
21
+ score=1.0,
22
+ syntax_score=1.0,
23
+ quality_score=style_base,
24
+ details={"compile_error": ""},
25
+ )
26
+
27
+ partial = clamp_score(0.15 + (0.55 * diff_score))
28
+ return TaskGrade(
29
+ score=partial,
30
+ syntax_score=0.0,
31
+ quality_score=diff_score * style_base,
32
+ details={"compile_error": error},
33
+ )
34
+
35
+
36
+ def grade_bug_fix_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
37
+ """Grade logic bug tasks with pytest pass fraction."""
38
+
39
+ if not compiles(candidate_code):
40
+ error = syntax_error_message(candidate_code)
41
+ return TaskGrade(score=0.0, syntax_score=0.0, details={"compile_error": error})
42
+
43
+ tests = list(task.visible_tests)
44
+ if include_hidden:
45
+ tests.extend(task.hidden_tests)
46
+
47
+ execution = run_pytest_suite(candidate_code, tests, timeout_s=3.0)
48
+ if execution.timed_out:
49
+ return TaskGrade(
50
+ score=0.0,
51
+ syntax_score=1.0,
52
+ tests_passed=execution.passed,
53
+ tests_total=execution.total,
54
+ timed_out=True,
55
+ details={"compile_error": "", "tests": execution.output},
56
+ )
57
+
58
+ pass_fraction = execution.passed / execution.total if execution.total else 0.0
59
+ quality = style_score(candidate_code, task.style_max_line_length)
60
+
61
+ return TaskGrade(
62
+ score=clamp_score(pass_fraction),
63
+ syntax_score=1.0,
64
+ tests_passed=execution.passed,
65
+ tests_total=execution.total,
66
+ quality_score=quality,
67
+ details={"compile_error": "", "tests": execution.output},
68
+ )
69
+
70
+
71
+ def grade_task(candidate_code: str, task: TaskSpec, include_hidden: bool = True) -> TaskGrade:
72
+ """Dispatch to the correct deterministic grader for one task."""
73
+
74
+ if task.task_kind == "syntax_fix":
75
+ return grade_syntax_task(candidate_code, task)
76
+ if task.task_kind == "bug_fix":
77
+ return grade_bug_fix_task(candidate_code, task, include_hidden=include_hidden)
78
+ return grade_optimization_task(candidate_code, task)
inference.py CHANGED
@@ -1,314 +1,462 @@
1
- """Baseline inference script for the Python code-review environment.
2
-
3
- This script is meant to be submission-friendly:
4
-
5
- - configuration comes from environment variables
6
- - model calls use the OpenAI client as required
7
- - malformed model output is handled gracefully
8
- - a JSON report is written for reproducibility
9
- """
10
-
11
- from __future__ import annotations
12
-
13
- import json
14
- import os
15
- import re
16
- from pathlib import Path
17
- from typing import Any, Dict, List, Optional
18
-
19
- from openai import OpenAI
20
-
21
- from client import PythonEnv
22
- from models import PythonReviewAction, ReviewFinding
23
-
24
-
25
- # Read all runtime configuration from environment variables so the script can
26
- # be reused unchanged across local runs, CI, and HF Spaces validation.
27
- API_BASE_URL = os.environ["API_BASE_URL"]
28
- MODEL_NAME = os.environ["MODEL_NAME"]
29
- API_KEY = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
30
- ENV_BASE_URL = os.getenv("ENV_BASE_URL")
31
- DOCKER_IMAGE = os.getenv("PYTHON_ENV_IMAGE", "python_env-env:latest")
32
- MAX_STEPS = int(os.getenv("MAX_STEPS", "3"))
33
- MAX_TASKS = int(os.getenv("MAX_TASKS", "3"))
34
- REPORT_PATH = Path(os.getenv("INFERENCE_REPORT_PATH", "inference_results.json"))
35
- TEMPERATURE = float(os.getenv("TEMPERATURE", "0"))
36
- MAX_TOKENS = int(os.getenv("MAX_TOKENS", "900"))
37
-
38
- SYSTEM_PROMPT = """You are a precise Python code reviewer.
39
- Return strict JSON using this schema:
40
- {
41
- "findings": [
42
- {
43
- "title": "short title",
44
- "line": 1,
45
- "category": "bug|security|style|performance|maintainability",
46
- "severity": "critical|warning|info",
47
- "rationale": "why it matters",
48
- "recommendation": "how to fix it",
49
- "rule_id": "optional-stable-id"
50
- }
51
- ],
52
- "patched_code": null
53
- }
54
-
55
- Rules:
56
- - Output JSON only. No markdown fences.
57
- - Only report issues supported by the visible code.
58
- - Prefer high precision over quantity.
59
- - Include line numbers when possible.
60
- """
61
-
62
-
63
- def _build_prompt(observation, step: int, history: List[str]) -> str:
64
- """Build the task prompt sent to the model for one step."""
65
-
66
- history_text = "\n".join(history[-4:]) if history else "No previous attempts."
67
- return (
68
- f"Task ID: {observation.task.task_id}\n"
69
- f"Difficulty: {observation.task.difficulty}\n"
70
- f"Objective: {observation.task.objective}\n"
71
- f"Step: {step}\n"
72
- f"Attempts remaining: {observation.attempts_remaining}\n"
73
- f"Current score: {observation.score:.2f}\n"
74
- f"Latest feedback: {observation.feedback or 'None'}\n"
75
- f"Attempt history:\n{history_text}\n\n"
76
- "Code to review:\n"
77
- "```python\n"
78
- f"{observation.task.code}\n"
79
- "```"
80
- )
81
-
82
-
83
- def _extract_text_content(message_content: Any) -> str:
84
- """Normalize OpenAI response content into one text string."""
85
-
86
- if isinstance(message_content, str):
87
- return message_content
88
- if isinstance(message_content, list):
89
- parts: List[str] = []
90
- for item in message_content:
91
- if isinstance(item, dict):
92
- text = item.get("text")
93
- if isinstance(text, str):
94
- parts.append(text)
95
- return "\n".join(parts)
96
- return ""
97
-
98
-
99
- def _extract_json_blob(content: str) -> str:
100
- """Extract a JSON object from plain or fenced model output."""
101
-
102
- fenced_match = re.search(r"```(?:json)?\s*(\{.*\})\s*```", content, re.DOTALL)
103
- if fenced_match:
104
- return fenced_match.group(1)
105
-
106
- start = content.find("{")
107
- end = content.rfind("}")
108
- if start != -1 and end != -1 and end > start:
109
- return content[start : end + 1]
110
- return content
111
-
112
-
113
- def _parse_response(content: str) -> Dict[str, Any]:
114
- """Parse the model response into a normalized payload dict."""
115
-
116
- raw = _extract_json_blob(content)
117
- try:
118
- data = json.loads(raw)
119
- except json.JSONDecodeError:
120
- return {"findings": [], "patched_code": None, "_parse_error": raw}
121
-
122
- findings = data.get("findings", [])
123
- if not isinstance(findings, list):
124
- findings = []
125
- patched_code = data.get("patched_code")
126
- if patched_code is not None and not isinstance(patched_code, str):
127
- patched_code = None
128
- return {"findings": findings, "patched_code": patched_code}
129
-
130
-
131
- def _completion(client: OpenAI, prompt: str) -> Dict[str, Any]:
132
- """Send one completion request to the configured model endpoint."""
133
-
134
- response = client.chat.completions.create(
135
- model=MODEL_NAME,
136
- temperature=TEMPERATURE,
137
- max_tokens=MAX_TOKENS,
138
- messages=[
139
- {"role": "system", "content": SYSTEM_PROMPT},
140
- {"role": "user", "content": prompt},
141
- ],
142
- )
143
- content = _extract_text_content(response.choices[0].message.content) or "{}"
144
- return _parse_response(content)
145
-
146
-
147
- def _normalize_findings(payload: Dict[str, Any]) -> List[ReviewFinding]:
148
- """Convert raw dict findings into validated `ReviewFinding` objects."""
149
-
150
- findings: List[ReviewFinding] = []
151
- for item in payload.get("findings", []):
152
- if not isinstance(item, dict):
153
- continue
154
- try:
155
- findings.append(ReviewFinding(**item))
156
- except Exception:
157
- continue
158
- return findings
159
-
160
-
161
- def _build_fallback_action(observation, note: str) -> PythonReviewAction:
162
- """Create a safe fallback action when model output is unusable."""
163
-
164
- return PythonReviewAction(
165
- operation="finalize" if observation.attempts_remaining <= 1 else "request_hint",
166
- note=note,
167
- )
168
-
169
-
170
- def _to_action(
171
- payload: Dict[str, Any],
172
- observation,
173
- finalize: bool,
174
- ) -> PythonReviewAction:
175
- """Convert a parsed model payload into a valid environment action."""
176
-
177
- findings = _normalize_findings(payload)
178
- if not findings and not payload.get("patched_code"):
179
- note = "Model returned no valid findings."
180
- if payload.get("_parse_error"):
181
- note = f"{note} Raw response could not be parsed as JSON."
182
- return _build_fallback_action(observation, note)
183
-
184
- return PythonReviewAction(
185
- operation="finalize" if finalize else "submit_findings",
186
- findings=findings,
187
- patched_code=payload.get("patched_code"),
188
- )
189
-
190
-
191
- def _make_env() -> PythonEnv:
192
- """Connect to a live environment or launch the Docker image."""
193
-
194
- if ENV_BASE_URL:
195
- return PythonEnv(base_url=ENV_BASE_URL)
196
- return PythonEnv.from_docker_image(DOCKER_IMAGE)
197
-
198
-
199
- def _task_result_dict(observation, step_logs: List[Dict[str, Any]]) -> Dict[str, Any]:
200
- """Build the report payload for one completed task run."""
201
-
202
- evaluation = observation.evaluation
203
- return {
204
- "task_id": observation.task.task_id,
205
- "difficulty": observation.task.difficulty,
206
- "title": observation.task.title,
207
- "score": observation.score,
208
- "passed": evaluation.passed,
209
- "matched_findings": evaluation.matched_findings,
210
- "total_findings": evaluation.total_findings,
211
- "false_positives": evaluation.false_positives,
212
- "duplicate_findings": evaluation.duplicate_findings,
213
- "weighted_recall": evaluation.weighted_recall,
214
- "patch_score": evaluation.patch_score,
215
- "steps": step_logs,
216
- }
217
-
218
-
219
- def main() -> None:
220
- """Run the configured model against the benchmark task set."""
221
-
222
- if not API_KEY:
223
- raise RuntimeError("Set HF_TOKEN or OPENAI_API_KEY before running inference.py")
224
-
225
- client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
226
- env = _make_env()
227
- episode_results: List[Dict[str, Any]] = []
228
-
229
- try:
230
- for index in range(MAX_TASKS):
231
- result = env.reset()
232
- observation = result.observation
233
- history: List[str] = []
234
- step_logs: List[Dict[str, Any]] = []
235
-
236
- print(
237
- f"Task {index + 1}: {observation.task.task_id} "
238
- f"({observation.task.difficulty})"
239
- )
240
-
241
- for step in range(1, MAX_STEPS + 1):
242
- prompt = _build_prompt(observation, step, history)
243
- try:
244
- # Model-call failures are captured in the report rather than
245
- # crashing the full benchmark run.
246
- payload = _completion(client, prompt)
247
- except Exception as exc:
248
- payload = {"findings": [], "patched_code": None, "_error": str(exc)}
249
-
250
- action = _to_action(
251
- payload=payload,
252
- observation=observation,
253
- finalize=step == MAX_STEPS or observation.attempts_remaining <= 1,
254
- )
255
-
256
- result = env.step(action)
257
- observation = result.observation
258
-
259
- step_log = {
260
- "step": step,
261
- "operation": action.operation,
262
- "submitted_findings": len(action.findings),
263
- "reward": result.reward or 0.0,
264
- "score": observation.score,
265
- "done": result.done,
266
- "feedback": observation.feedback,
267
- }
268
- if payload.get("_error"):
269
- step_log["model_error"] = payload["_error"]
270
- if payload.get("_parse_error"):
271
- step_log["parse_error"] = True
272
- step_logs.append(step_log)
273
-
274
- # The history string is fed back into later prompts so the
275
- # model can see what it already tried.
276
- history.append(
277
- f"step={step} op={action.operation} findings={len(action.findings)} "
278
- f"score={observation.score:.2f} feedback={observation.feedback}"
279
- )
280
-
281
- print(
282
- f" step={step} op={action.operation} findings={len(action.findings)} "
283
- f"score={observation.score:.2f} reward={(result.reward or 0.0):.2f} "
284
- f"done={result.done}"
285
- )
286
-
287
- if result.done:
288
- break
289
-
290
- episode_results.append(_task_result_dict(observation, step_logs))
291
- finally:
292
- env.close()
293
-
294
- mean_score = (
295
- sum(item["score"] for item in episode_results) / len(episode_results)
296
- if episode_results
297
- else 0.0
298
- )
299
- summary = {
300
- "model_name": MODEL_NAME,
301
- "api_base_url": API_BASE_URL,
302
- "task_count": len(episode_results),
303
- "mean_score": mean_score,
304
- "results": episode_results,
305
- }
306
-
307
- # Persist the report so scores can be compared across runs and models.
308
- REPORT_PATH.write_text(json.dumps(summary, indent=2), encoding="utf-8")
309
- print(json.dumps(summary, indent=2))
310
- print(f"\nSaved report to {REPORT_PATH}")
311
-
312
-
313
- if __name__ == "__main__":
314
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Fail-safe inference entrypoint for the Python code review environment."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import io
7
+ import json
8
+ import os
9
+ import subprocess
10
+ import sys
11
+ import time
12
+ from collections.abc import Iterable
13
+ from contextlib import redirect_stderr, redirect_stdout
14
+ from typing import Any, Dict, Optional
15
+
16
+ from compat import install_openenv_fastmcp_compat
17
+
18
+ try:
19
+ from openai import OpenAI
20
+ except Exception:
21
+ OpenAI = None # type: ignore[assignment]
22
+
23
+
24
+ install_openenv_fastmcp_compat()
25
+
26
+ try:
27
+ from server.env import PythonCodeReviewEnvironment
28
+ except Exception:
29
+ PythonCodeReviewEnvironment = None # type: ignore[assignment]
30
+
31
+ try:
32
+ from models import PythonCodeReviewAction
33
+ except Exception:
34
+ PythonCodeReviewAction = None # type: ignore[assignment]
35
+
36
+ try:
37
+ from tasks import task_ids
38
+ except Exception:
39
+ task_ids = None # type: ignore[assignment]
40
+
41
+
42
+ ALLOWED_ACTIONS = {
43
+ "analyze_code",
44
+ "edit_code",
45
+ "run_tests",
46
+ "submit_solution",
47
+ }
48
+ DEFAULT_MODEL_NAME = "mock-model"
49
+ DEFAULT_ACTION = {"action_type": "analyze_code", "code": None, "fallback_reason": "mock_response"}
50
+ API_TIMEOUT_SECONDS = 3.0
51
+ API_RETRIES = 1
52
+ API_RETRY_DELAY_SECONDS = 0.2
53
+ MAX_STEPS = 2
54
+
55
+
56
+ def safe_env(name: str, default: str = "") -> str:
57
+ """Read an allowed environment variable and return a safe string default."""
58
+ try:
59
+ value = os.getenv(name)
60
+ if value is None:
61
+ return default
62
+ return str(value)
63
+ except Exception:
64
+ return default
65
+
66
+
67
+ def clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
68
+ """Clamp a numeric value to a bounded range."""
69
+ try:
70
+ return max(low, min(high, float(value)))
71
+ except Exception:
72
+ return low
73
+
74
+
75
+ def safe_float(value: Any, default: float = 0.0) -> float:
76
+ """Convert a value to float without raising."""
77
+ try:
78
+ return float(value)
79
+ except Exception:
80
+ return default
81
+
82
+
83
+ def safe_text(value: Any, default: str = "") -> str:
84
+ """Convert any value into a bounded, printable string."""
85
+ try:
86
+ text = str(value)
87
+ except Exception:
88
+ return default
89
+ text = " ".join(text.split())
90
+ return text[:160] if text else default
91
+
92
+
93
+ def safe_getattr(obj: Any, name: str, default: Any = None) -> Any:
94
+ """Fetch an attribute from an object without raising."""
95
+ try:
96
+ return getattr(obj, name, default)
97
+ except Exception:
98
+ return default
99
+
100
+
101
+ def parse_json_response(raw_text: str) -> Dict[str, Any]:
102
+ """Parse model output into a safe action payload with deterministic fallback."""
103
+ try:
104
+ text = raw_text or ""
105
+ start = text.find("{")
106
+ end = text.rfind("}") + 1
107
+ if start >= 0 and end > start:
108
+ payload = json.loads(text[start:end])
109
+ if isinstance(payload, dict):
110
+ action_type = payload.get("action_type", DEFAULT_ACTION["action_type"])
111
+ code = payload.get("code")
112
+ if action_type not in ALLOWED_ACTIONS:
113
+ action_type = DEFAULT_ACTION["action_type"]
114
+ if action_type != "edit_code":
115
+ code = None
116
+ return {
117
+ "action_type": action_type,
118
+ "code": code,
119
+ "fallback_reason": "",
120
+ }
121
+ except Exception:
122
+ pass
123
+ return dict(DEFAULT_ACTION)
124
+
125
+
126
+ def build_prompt(observation: Any) -> str:
127
+ """Build a short prompt from the current observation with safe defaults."""
128
+ try:
129
+ task_description = safe_text(safe_getattr(observation, "task_description", ""), "No task description.")
130
+ current_code = safe_text(safe_getattr(observation, "current_code", ""), "")
131
+ errors = safe_text(safe_getattr(observation, "errors", ""), "")
132
+ tests = safe_text(safe_getattr(observation, "test_results", ""), "")
133
+ score = clamp(safe_getattr(observation, "score", 0.0))
134
+ visible_tests = safe_getattr(observation, "visible_tests", [])
135
+ if not isinstance(visible_tests, Iterable) or isinstance(visible_tests, (str, bytes)):
136
+ visible_tests = []
137
+ visible_lines = []
138
+ for item in list(visible_tests)[:4]:
139
+ visible_lines.append(f"- {safe_text(item, 'unknown test')}")
140
+ visible_block = "\n".join(visible_lines) if visible_lines else "- none"
141
+ return (
142
+ "Return exactly one JSON object with keys action_type and optional code.\n"
143
+ "Allowed action_type values: analyze_code, edit_code, run_tests, submit_solution.\n"
144
+ f"Task: {task_description}\n"
145
+ f"Score: {score:.3f}\n"
146
+ f"Errors: {errors or 'none'}\n"
147
+ f"Tests: {tests or 'not available'}\n"
148
+ f"Visible tests:\n{visible_block}\n"
149
+ f"Code:\n{current_code}\n"
150
+ )
151
+ except Exception:
152
+ return (
153
+ "Return exactly one JSON object with keys action_type and optional code. "
154
+ "Use action_type analyze_code."
155
+ )
156
+
157
+
158
+ def create_client() -> Optional[Any]:
159
+ """Create an OpenAI-compatible client using only the allowed environment variables."""
160
+ if OpenAI is None:
161
+ return None
162
+ base_url = safe_env("API_BASE_URL", "")
163
+ if not base_url:
164
+ return None
165
+ try:
166
+ if safe_env("HF_TOKEN", ""):
167
+ os.environ["OPENAI_API_KEY"] = safe_env("HF_TOKEN", "")
168
+ except Exception:
169
+ pass
170
+ try:
171
+ client = OpenAI(base_url=os.getenv("API_BASE_URL"))
172
+ return client
173
+ except Exception:
174
+ return None
175
+
176
+
177
+ def run_llm(client: Optional[Any], model: str, prompt: str) -> Dict[str, Any]:
178
+ """Call the LLM with timeout and retry, then fall back to a mock action."""
179
+ if client is None:
180
+ fallback = dict(DEFAULT_ACTION)
181
+ fallback["fallback_reason"] = "client_unavailable"
182
+ return fallback
183
+
184
+ last_reason = "llm_unavailable"
185
+ for attempt in range(API_RETRIES + 1):
186
+ try:
187
+ with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
188
+ response = client.with_options(timeout=API_TIMEOUT_SECONDS).chat.completions.create(
189
+ model=model,
190
+ messages=[{"role": "user", "content": prompt}],
191
+ temperature=0,
192
+ max_tokens=300,
193
+ )
194
+ message = safe_getattr(response.choices[0].message, "content", "")
195
+ parsed = parse_json_response(message)
196
+ if parsed.get("fallback_reason"):
197
+ parsed["fallback_reason"] = "parse_failed"
198
+ return parsed
199
+ except Exception as exc:
200
+ last_reason = safe_text(exc, "llm_error").lower().replace(" ", "_")
201
+ if attempt < API_RETRIES:
202
+ try:
203
+ time.sleep(API_RETRY_DELAY_SECONDS * (attempt + 1))
204
+ except Exception:
205
+ pass
206
+
207
+ fallback = dict(DEFAULT_ACTION)
208
+ fallback["fallback_reason"] = last_reason[:48] or "llm_retry_exhausted"
209
+ return fallback
210
+
211
+
212
+ def probe_docker(image_name: str) -> Dict[str, Any]:
213
+ """Safely validate Docker connectivity when a local image name is provided."""
214
+ if not image_name:
215
+ return {"checked": False, "available": False, "reason": "docker_skip"}
216
+ try:
217
+ with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
218
+ result = subprocess.run(
219
+ ["docker", "image", "inspect", image_name],
220
+ capture_output=True,
221
+ text=True,
222
+ timeout=3,
223
+ check=False,
224
+ )
225
+ if result.returncode == 0:
226
+ return {"checked": True, "available": True, "reason": "docker_ok"}
227
+ return {"checked": True, "available": False, "reason": "docker_unreachable"}
228
+ except Exception as exc:
229
+ return {"checked": True, "available": False, "reason": safe_text(exc, "docker_error").lower().replace(" ", "_")}
230
+
231
+
232
+ def fallback_step_result(reason: str, docker_status: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
233
+ """Return a deterministic dummy step result when environment execution fails."""
234
+ docker_reason = safe_text((docker_status or {}).get("reason", "docker_skip"), "docker_skip")
235
+ short_reason = safe_text(reason, "env_fallback").lower().replace(" ", "_")
236
+ return {
237
+ "status": "ok",
238
+ "fallback": True,
239
+ "reason": short_reason[:64],
240
+ "reward": 0.0,
241
+ "improvement": 0.0,
242
+ "score": 0.0,
243
+ "done": True,
244
+ "docker": docker_reason[:32],
245
+ }
246
+
247
+
248
+ def safe_task_list() -> list[str]:
249
+ """Load task identifiers without raising."""
250
+ try:
251
+ if callable(task_ids):
252
+ loaded = list(task_ids())
253
+ if loaded:
254
+ return [safe_text(item, "fallback-task") for item in loaded]
255
+ except Exception:
256
+ pass
257
+ return ["fallback-task"]
258
+
259
+
260
+ def make_action(action_payload: Dict[str, Any]) -> Any:
261
+ """Build a validated environment action or a safe placeholder."""
262
+ action_type = action_payload.get("action_type", DEFAULT_ACTION["action_type"])
263
+ if action_type not in ALLOWED_ACTIONS:
264
+ action_type = DEFAULT_ACTION["action_type"]
265
+ code = action_payload.get("code")
266
+ if action_type != "edit_code":
267
+ code = None
268
+ if PythonCodeReviewAction is None:
269
+ return {"action_type": action_type, "code": code}
270
+ try:
271
+ return PythonCodeReviewAction(action_type=action_type, code=code)
272
+ except Exception:
273
+ try:
274
+ return PythonCodeReviewAction(action_type=DEFAULT_ACTION["action_type"], code=None)
275
+ except Exception:
276
+ return {"action_type": DEFAULT_ACTION["action_type"], "code": None}
277
+
278
+
279
+ def compute_reward(
280
+ previous_score: float,
281
+ current_score: float,
282
+ step_reward: float,
283
+ used_fallback: bool,
284
+ done: bool,
285
+ ) -> Dict[str, float]:
286
+ """Compute a deterministic dynamic reward and improvement metric."""
287
+ prev_value = clamp(previous_score)
288
+ curr_value = clamp(current_score)
289
+ improvement = round(curr_value - prev_value, 4)
290
+ bounded_step_reward = max(-1.0, min(1.0, safe_float(step_reward, 0.0)))
291
+ reward_value = (
292
+ 0.55 * curr_value
293
+ + 0.30 * max(improvement, 0.0)
294
+ + 0.10 * max(bounded_step_reward, 0.0)
295
+ + (0.05 if done and curr_value >= 0.99 else 0.0)
296
+ - (0.05 if used_fallback else 0.0)
297
+ )
298
+ return {
299
+ "reward": round(clamp(reward_value), 4),
300
+ "improvement": improvement,
301
+ }
302
+
303
+
304
+ def safe_step(env: Any, action: Any) -> Any:
305
+ """Execute one environment step without allowing stdout leaks or exceptions."""
306
+ try:
307
+ with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
308
+ return env.step(action)
309
+ except Exception:
310
+ return None
311
+
312
+
313
+ def safe_reset(env: Any, task_id: str) -> Any:
314
+ """Reset the environment safely for a task."""
315
+ try:
316
+ with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
317
+ return env.reset(task_id=task_id)
318
+ except Exception:
319
+ return None
320
+
321
+
322
+ def run_env(client: Optional[Any], model: str) -> Dict[str, Any]:
323
+ """Run the environment loop safely and return a structured result payload."""
324
+ docker_status = probe_docker(safe_env("LOCAL_IMAGE_NAME", ""))
325
+ if PythonCodeReviewEnvironment is None:
326
+ return fallback_step_result("env_import_failed", docker_status)
327
+
328
+ try:
329
+ with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
330
+ env = PythonCodeReviewEnvironment(verbose=False)
331
+ except Exception as exc:
332
+ return fallback_step_result(f"env_init_failed_{safe_text(exc, 'unknown')}", docker_status)
333
+
334
+ tasks = safe_task_list()
335
+ task_id = tasks[0] if tasks else "fallback-task"
336
+ observation = safe_reset(env, task_id)
337
+ if observation is None:
338
+ return fallback_step_result("env_reset_failed", docker_status)
339
+
340
+ previous_score = clamp(safe_getattr(observation, "score", 0.0))
341
+ total_step_reward = 0.0
342
+ used_fallback = False
343
+ final_status = "ok"
344
+ final_reason = "completed"
345
+ final_observation = observation
346
+
347
+ for step_index in range(MAX_STEPS):
348
+ prompt = build_prompt(final_observation)
349
+ action_payload = run_llm(client, model, prompt)
350
+ used_fallback = used_fallback or bool(action_payload.get("fallback_reason"))
351
+ action = make_action(action_payload)
352
+ next_observation = safe_step(env, action)
353
+ if next_observation is None:
354
+ final_status = "ok"
355
+ final_reason = "env_step_fallback"
356
+ used_fallback = True
357
+ break
358
+
359
+ final_observation = next_observation
360
+ total_step_reward += safe_float(safe_getattr(final_observation, "reward", 0.0), 0.0)
361
+ done = bool(safe_getattr(final_observation, "done", False))
362
+ score = clamp(safe_getattr(final_observation, "score", 0.0))
363
+ if safe_getattr(final_observation, "last_action_status", ""):
364
+ final_reason = safe_text(safe_getattr(final_observation, "last_action_status", ""), "step_completed")
365
+ elif action_payload.get("fallback_reason"):
366
+ final_reason = safe_text(action_payload.get("fallback_reason"), "llm_fallback")
367
+ else:
368
+ final_reason = f"step_{step_index + 1}_completed"
369
+ if done:
370
+ break
371
+
372
+ if step_index == 0:
373
+ submit_action = make_action({"action_type": "submit_solution", "code": None})
374
+ submitted_observation = safe_step(env, submit_action)
375
+ if submitted_observation is None:
376
+ final_reason = "submit_fallback"
377
+ used_fallback = True
378
+ break
379
+ final_observation = submitted_observation
380
+ total_step_reward += safe_float(safe_getattr(final_observation, "reward", 0.0), 0.0)
381
+ if safe_getattr(final_observation, "last_action_status", ""):
382
+ final_reason = safe_text(safe_getattr(final_observation, "last_action_status", ""), "submit_completed")
383
+ break
384
+
385
+ current_score = clamp(safe_getattr(final_observation, "score", previous_score))
386
+ done = bool(safe_getattr(final_observation, "done", True))
387
+ metrics = compute_reward(
388
+ previous_score=previous_score,
389
+ current_score=current_score,
390
+ step_reward=total_step_reward,
391
+ used_fallback=used_fallback,
392
+ done=done,
393
+ )
394
+ return {
395
+ "status": final_status,
396
+ "fallback": used_fallback,
397
+ "reason": safe_text(final_reason, "completed").lower().replace(" ", "_")[:64],
398
+ "reward": metrics["reward"],
399
+ "improvement": metrics["improvement"],
400
+ "score": round(current_score, 4),
401
+ "done": done,
402
+ "docker": safe_text(docker_status.get("reason", "docker_skip"), "docker_skip")[:32],
403
+ }
404
+
405
+
406
+ def format_step_message(result: Dict[str, Any]) -> str:
407
+ """Format the only allowed STEP line for stdout."""
408
+ try:
409
+ fallback = bool(result.get("fallback", False))
410
+ reason = safe_text(result.get("reason", "completed"), "completed").lower().replace(" ", "_")
411
+ if fallback:
412
+ reward = safe_float(result.get("reward", 0.0), 0.0)
413
+ improvement = safe_float(result.get("improvement", 0.0), 0.0)
414
+ score = safe_float(result.get("score", 0.0), 0.0)
415
+ status = safe_text(result.get("status", "ok"), "ok").lower().replace(" ", "_")
416
+ return (
417
+ f"error handled: {reason} reward={reward:.4f} status={status} "
418
+ f"fallback=true improvement={improvement:.4f} score={score:.4f}"
419
+ )
420
+ reward = safe_float(result.get("reward", 0.0), 0.0)
421
+ improvement = safe_float(result.get("improvement", 0.0), 0.0)
422
+ score = safe_float(result.get("score", 0.0), 0.0)
423
+ status = safe_text(result.get("status", "ok"), "ok").lower().replace(" ", "_")
424
+ return (
425
+ f"reward={reward:.4f} status={status} "
426
+ f"fallback=false improvement={improvement:.4f} score={score:.4f}"
427
+ )
428
+ except Exception:
429
+ return "error handled: formatting_failed"
430
+
431
+
432
+ def main() -> int:
433
+ """Run the inference workflow and always terminate successfully."""
434
+ step_message = "error handled: initialization_failed"
435
+ try:
436
+ model_name = safe_env("MODEL_NAME", DEFAULT_MODEL_NAME) or DEFAULT_MODEL_NAME
437
+ client = create_client()
438
+ result = run_env(client, model_name)
439
+ step_message = format_step_message(result)
440
+ except BaseException as exc:
441
+ step_message = f"error handled: {safe_text(exc, 'unexpected_failure').lower().replace(' ', '_')[:64]}"
442
+ finally:
443
+ try:
444
+ print("START")
445
+ print(f"STEP: {step_message}")
446
+ print("END")
447
+ except Exception:
448
+ pass
449
+ return 0
450
+
451
+
452
+ if __name__ == "__main__":
453
+ try:
454
+ main()
455
+ except BaseException:
456
+ try:
457
+ print("START")
458
+ print("STEP: error handled: fatal_guard")
459
+ print("END")
460
+ except Exception:
461
+ pass
462
+ sys.exit(0)
models.py CHANGED
@@ -1,217 +1,185 @@
1
- """Typed models for the Python code-review environment.
2
 
3
- This module is the shared contract between:
4
 
5
- - the OpenEnv server implementation
6
- - the REST API layer
7
- - the benchmark grader
8
- - the inference script
9
- - the tests
10
-
11
- Keeping these models centralized makes the environment easier to validate,
12
- serialize, and evolve without each module inventing its own payload shape.
13
- """
14
-
15
- from typing import List, Literal, Optional
16
 
17
  from pydantic import BaseModel, Field
18
- from openenv.core.env_server.types import Action, Observation
19
 
 
20
 
21
- # Difficulty buckets are intentionally small and fixed so tasks can be
22
- # grouped for curriculum learning and reporting without extra normalization.
23
- Difficulty = Literal["easy", "medium", "hard"]
24
 
25
- # Severity is separate from category because one category such as "security"
26
- # can still vary in importance across tasks.
 
 
27
  Severity = Literal["critical", "warning", "info"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- # Categories help both humans and agents understand what type of issue was found.
30
- Category = Literal["bug", "security", "style", "performance", "maintainability"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- # Operations define the small action space an agent can use during an episode.
33
- Operation = Literal["submit_findings", "request_hint", "finalize"]
 
 
34
 
35
 
36
  class ReviewFinding(BaseModel):
37
- """A structured review finding.
38
-
39
- Each finding is designed to be machine-gradable while still resembling the
40
- sort of issue summary a human reviewer would write in a real code review.
41
- """
42
-
43
- title: str = Field(..., description="Short title for the finding")
44
- line: Optional[int] = Field(default=None, description="1-based source line number")
45
- category: Category = Field(default="bug", description="Issue category")
46
- severity: Severity = Field(default="warning", description="Issue severity")
47
- rationale: str = Field(
48
- default="",
49
- description="Why the issue matters and how it affects behaviour or safety",
50
- )
51
- recommendation: Optional[str] = Field(
52
- default=None, description="Concrete fix recommendation"
53
- )
54
- rule_id: Optional[str] = Field(
55
- default=None,
56
- description="Stable internal rule identifier when known",
57
- )
58
 
 
 
 
 
 
 
 
 
59
 
60
- class TaskDescriptor(BaseModel):
61
- """Public task metadata shown to the agent.
 
 
62
 
63
- This is intentionally the "visible" task information. Hidden grading
64
- details stay inside the server task bank so the benchmark remains useful.
65
- """
66
-
67
- task_id: str = Field(..., description="Stable task identifier")
68
- difficulty: Difficulty = Field(..., description="Task difficulty bucket")
69
- title: str = Field(..., description="Short task title")
70
- objective: str = Field(..., description="What the reviewer should accomplish")
71
- code: str = Field(..., description="Python code to review")
72
- max_steps: int = Field(..., ge=1, description="Maximum actions allowed")
73
- success_threshold: float = Field(
74
- ..., ge=0.0, le=1.0, description="Minimum score considered a pass"
75
- )
76
-
77
-
78
- class TaskEvaluation(BaseModel):
79
- """Deterministic grader output.
80
-
81
- This model is returned in observations and offline grading routes so that
82
- both online interaction and offline evaluation use exactly the same metrics.
83
- """
84
-
85
- matched_reference_ids: List[str] = Field(default_factory=list)
86
- matched_findings: int = Field(default=0, ge=0)
87
- total_findings: int = Field(default=0, ge=0)
88
- false_positives: int = Field(default=0, ge=0)
89
- duplicate_findings: int = Field(default=0, ge=0)
90
- weighted_recall: float = Field(default=0.0, ge=0.0, le=1.0)
91
- patch_score: float = Field(default=0.0, ge=0.0, le=1.0)
92
- score: float = Field(default=0.0, ge=0.0, le=1.0)
93
- passed: bool = Field(default=False)
94
-
95
-
96
- class PythonReviewAction(Action):
97
- """Action submitted by an agent during an episode.
98
-
99
- The action space is kept intentionally small:
100
-
101
- - `submit_findings` for intermediate progress
102
- - `request_hint` when the agent needs guidance at a small penalty
103
- - `finalize` when the agent wants the episode to end
104
- """
105
-
106
- operation: Operation = Field(
107
- default="submit_findings",
108
- description="How to interact with the environment on this step",
109
- )
110
- findings: List[ReviewFinding] = Field(
111
- default_factory=list,
112
- description="Structured findings being submitted for grading",
113
- )
114
- patched_code: Optional[str] = Field(
115
- default=None,
116
- description="Optional improved version of the code under review",
117
- )
118
- note: Optional[str] = Field(
119
- default=None,
120
- description="Optional free-form reviewer note for logging or context",
121
- )
122
-
123
-
124
- class PythonEnvConfig(BaseModel):
125
- """Environment-level configuration knobs.
126
-
127
- These values are useful for experimentation because they let you adjust
128
- reward shaping and curriculum ordering without changing the grader logic.
129
- """
130
-
131
- task_order: List[str] = Field(
132
- default_factory=lambda: ["py-review-easy", "py-review-medium", "py-review-hard"],
133
- description="Deterministic task order used across resets",
134
- )
135
- max_steps_per_task: int = Field(default=4, ge=1, le=10)
136
- hint_penalty: float = Field(default=0.05, ge=0.0, le=1.0)
137
- false_positive_penalty: float = Field(default=0.08, ge=0.0, le=1.0)
138
- duplicate_penalty: float = Field(default=0.03, ge=0.0, le=1.0)
139
- patch_bonus_multiplier: float = Field(default=0.2, ge=0.0, le=1.0)
140
- max_history_entries: int = Field(default=50, ge=1, le=500)
141
-
142
-
143
- class PythonReviewObservation(Observation):
144
- """Observation returned by `reset()` and `step()`.
145
-
146
- The observation combines:
147
-
148
- - visible task context
149
- - immediate feedback on the previous action
150
- - cumulative evaluation state
151
- - OpenEnv-standard reward/done/metadata fields
152
- """
153
-
154
- task: TaskDescriptor = Field(..., description="Current task details")
155
- instructions: str = Field(
156
- default="Inspect the code and submit structured findings.",
157
- description="Episode instructions shown to the agent",
158
- )
159
- feedback: str = Field(default="", description="Feedback for the last action")
160
- submitted_findings: List[ReviewFinding] = Field(
161
- default_factory=list,
162
- description="All findings submitted so far in this episode",
163
- )
164
- hints_used: int = Field(default=0, ge=0)
165
- attempts_remaining: int = Field(default=0, ge=0)
166
- evaluation: TaskEvaluation = Field(default_factory=TaskEvaluation)
167
- score: float = Field(
168
- default=0.0,
169
- ge=0.0,
170
- le=1.0,
171
- description="Current task score after this step",
172
- )
173
- review_time_ms: float = Field(default=0.0, ge=0.0)
174
-
175
-
176
- class EpisodeRecord(BaseModel):
177
- """Stored summary of a completed or in-progress episode.
178
-
179
- This model is used by the custom history routes and is intentionally
180
- compact enough to archive for later analysis or dataset creation.
181
- """
182
-
183
- episode_id: str
184
- task_id: str
185
- difficulty: Difficulty
186
- title: str
187
- final_score: float = Field(ge=0.0, le=1.0)
188
- passed: bool = Field(default=False)
189
- steps_taken: int = Field(default=0, ge=0)
190
- hints_used: int = Field(default=0, ge=0)
191
- matched_findings: int = Field(default=0, ge=0)
192
- total_findings: int = Field(default=0, ge=0)
193
- false_positives: int = Field(default=0, ge=0)
194
- duplicate_findings: int = Field(default=0, ge=0)
195
- status: Literal["active", "completed"] = Field(default="completed")
196
- created_at: str
197
- updated_at: str
198
-
199
-
200
- class DirectReviewRequest(BaseModel):
201
- """Request model for ad-hoc review outside the benchmark tasks."""
202
-
203
- code: str = Field(..., description="Python source code to inspect")
204
- context: Optional[str] = Field(
205
- default=None, description="Optional explanation of the code's purpose"
206
- )
207
 
208
 
209
  class DirectReviewResponse(BaseModel):
210
- """Static review result for arbitrary Python code.
211
-
212
- This route is useful for manual testing and dataset generation because it
213
- lets you review arbitrary snippets without entering the benchmark loop.
214
- """
215
 
216
  issues: List[ReviewFinding] = Field(default_factory=list)
217
  summary: str = Field(default="")
@@ -219,30 +187,26 @@ class DirectReviewResponse(BaseModel):
219
  improved_code: Optional[str] = Field(default=None)
220
 
221
 
222
- class DeleteResponse(BaseModel):
223
- """Small acknowledgement payload for DELETE routes."""
224
-
225
- detail: str
226
-
227
 
228
- class HealthResponse(BaseModel):
229
- """Health payload used by Docker and Spaces checks.
230
-
231
- This payload stays intentionally simple because health checks are often
232
- consumed by infrastructure rather than by human users.
233
- """
234
-
235
- status: Literal["ok"] = "ok"
236
- environment: str = "python_env"
237
- task_count: int = Field(default=0, ge=0)
238
- active_task_id: Optional[str] = None
239
- active_episode_id: Optional[str] = None
240
-
241
-
242
- # Backward-compatible aliases keep older imports working while the project
243
- # standardizes on the `Python*` naming convention.
244
- PythonAction = PythonReviewAction
245
- PythonObservation = PythonReviewObservation
246
- CodeReviewAction = PythonReviewAction
247
- CodeReviewObservation = PythonReviewObservation
248
- CodeReviewConfig = PythonEnvConfig
 
1
+ """Typed models for Python code review and repair environment."""
2
 
3
+ from __future__ import annotations
4
 
5
+ from typing import Any, Dict, List, Literal, Optional
 
 
 
 
 
 
 
 
 
 
6
 
7
  from pydantic import BaseModel, Field
 
8
 
9
+ from compat import Action, Observation, State
10
 
 
 
 
11
 
12
+ Difficulty = Literal["easy", "medium", "hard"]
13
+ TaskKind = Literal["syntax_fix", "bug_fix", "optimization"]
14
+ ActionType = Literal["analyze_code", "edit_code", "run_tests", "submit_solution"]
15
+ Category = Literal["bug", "security", "performance", "maintainability", "style", "testing"]
16
  Severity = Literal["critical", "warning", "info"]
17
+
18
+
19
+ class HistoryEntry(BaseModel):
20
+ """Record of one action taken during an episode."""
21
+
22
+ step: int = Field(..., ge=0)
23
+ action_type: ActionType
24
+ status: str = Field(..., description="Outcome message")
25
+ reward: float = Field(...)
26
+
27
+
28
+ class RewardDetails(BaseModel):
29
+ """Detailed reward breakdown for transparent agent feedback.
30
+
31
+ The reward system is dynamic and multi-component, with 6 independent sources:
32
+
33
+ 1. Progress Reward (max +0.25)
34
+ - Awarded for score improvement from previous step
35
+ - Formula: min(PROGRESS_SCALE * score_delta, 0.25)
36
+ - Encourages continuous improvement
37
+
38
+ 2. Syntax Reward (max +0.35)
39
+ - One-time bonus for fixing syntax errors (first compile)
40
+ - Applied when code transitions from uncompilable to compilable
41
+ - Acknowledges the critical first step of valid code
42
+
43
+ 3. Test Reward (max +0.20)
44
+ - Based on improvement in test pass rate
45
+ - Formula: min(TEST_PASS_REWARD_SCALE * test_improvement, 0.20)
46
+ - Rewards incremental test progress
47
+
48
+ 4. Quality Reward (max +0.15)
49
+ - Based on AST-detected code quality metrics
50
+ - Rewards improvements in structure, readability, best practices
51
+ - Uses deterministic grader feedback
52
+
53
+ 5. Stagnation Penalty (−0.10)
54
+ - Applied when agent acts but code doesn't change
55
+ - Encourages editing rather than repeated analysis
56
+ - Configurable via STAGNATION_PENALTY constant
57
+
58
+ 6. Regression Penalty (scale −0.20)
59
+ - Applied when score decreases from previous step
60
+ - Formula: REGRESSION_PENALTY_SCALE * abs(score_delta)
61
+ - Discourages actions that make code worse
62
+
63
+ Final Reward: clamp(progress + syntax + test + quality - stagnation - regression, -1.0, +1.0)
64
+
65
+ The result is always bounded in [-1.0, +1.0], providing interpretable feedback for learning.
66
+ """
67
+
68
+ value: float = Field(..., description="Net scalar reward for this step (bounded in [-1.0, +1.0])")
69
+ syntax_reward: float = Field(default=0.0, description="Bonus for fixing syntax errors (max +0.35)")
70
+ test_reward: float = Field(default=0.0, description="Reward from test improvements (max +0.20)")
71
+ quality_bonus: float = Field(default=0.0, description="Bonus for code quality improvements (max +0.15)")
72
+ correctness_bonus: float = Field(default=0.0, description="Bonus for full correctness (max +0.50)")
73
+ progress_delta: float = Field(default=0.0, description="Reward from score improvement (max +0.25)")
74
+ stagnation_penalty: float = Field(default=0.0, description="Penalty for unchanged code (−0.10)")
75
+ regression_penalty: float = Field(default=0.0, description="Penalty for score decline (scale −0.20)")
76
+ invalid_action_penalty: float = Field(default=0.0, description="Penalty for invalid actions (−0.15)")
77
+ timeout_penalty: float = Field(default=0.0, description="Penalty for execution timeout (−0.15)")
78
+ reason: str = Field(..., description="Human-readable explanation of the reward")
79
+
80
+ # Debug information for transparency
81
+ prev_score: float = Field(default=0.0, description="Score before this step")
82
+ curr_score: float = Field(default=0.0, description="Score after this step")
83
+ code_changed: bool = Field(default=False, description="Whether the action modified the code")
84
+
85
+
86
+ class PythonCodeReviewAction(Action):
87
+ """Action space for code review environment."""
88
+
89
+ action_type: ActionType = Field(..., description="Type of action to perform")
90
+ code: Optional[str] = Field(default=None, description="New code for edit_code actions")
91
+
92
+
93
+ class PythonCodeReviewObservation(Observation):
94
+ """Observation returned by reset() and step()."""
95
+
96
+ task_id: str = Field(..., description="Current task identifier")
97
+ title: str = Field(default="", description="Human-readable task title")
98
+ difficulty: Difficulty = Field(..., description="Task difficulty level")
99
+ task_kind: Optional[TaskKind] = Field(default=None, description="Task type")
100
+ task_description: str = Field(..., description="Detailed task description")
101
+ current_code: str = Field(..., description="Current code state")
102
+ errors: str = Field(..., description="Syntax/compilation errors, if any")
103
+ test_results: str = Field(..., description="Results from test execution")
104
+ visible_tests: List[str] = Field(default_factory=list, description="Public test cases")
105
+ history: List[HistoryEntry] = Field(default_factory=list, description="Action history")
106
+ attempts_remaining: int = Field(..., ge=0, description="Actions left in episode")
107
+ last_action_status: str = Field(default="", description="Outcome message from the last action")
108
+ score: float = Field(..., ge=0.0, le=1.0, description="Current episode score")
109
+ reward_details: RewardDetails = Field(
110
+ default_factory=lambda: RewardDetails(value=0.0, reason="Reset"),
111
+ description="Detailed reward breakdown for the last action",
112
+ )
113
+
114
+
115
+ class PythonCodeReviewState(State):
116
+ """Exposed environment state."""
117
+
118
+ episode_id: str = Field(..., description="Unique episode identifier")
119
+ step_count: int = Field(default=0, ge=0)
120
+ task_id: Optional[str] = Field(default=None)
121
+ difficulty: Optional[Difficulty] = Field(default=None)
122
+ task_kind: Optional[TaskKind] = Field(default=None)
123
+ attempts_remaining: int = Field(default=0, ge=0)
124
+ current_code: str = Field(default="")
125
+ errors: str = Field(default="")
126
+ test_results: str = Field(default="")
127
+ history: List[HistoryEntry] = Field(default_factory=list)
128
+ score: float = Field(default=0.0, ge=0.0, le=1.0)
129
+ done: bool = Field(default=False)
130
+
131
+
132
+ class TaskDescriptor(BaseModel):
133
+ """Public task metadata."""
134
 
135
+ task_id: str = Field(..., description="Stable task identifier")
136
+ title: str = Field(..., description="Human-readable title")
137
+ difficulty: Difficulty = Field(..., description="Difficulty level")
138
+ task_kind: Optional[TaskKind] = Field(default=None, description="Type of task")
139
+ task_description: str = Field(default="", description="Full task description")
140
+ starter_code: str = Field(default="", description="Initial broken code")
141
+ visible_tests: List[str] = Field(default_factory=list, description="Public test cases")
142
+ goal: str = Field(default="", description="Optional goal summary for review-style tasks")
143
+ repo_summary: str = Field(default="", description="Optional repository context")
144
+ changed_files: List[str] = Field(default_factory=list, description="Changed files for review-style tasks")
145
+ available_files: List[str] = Field(default_factory=list, description="Browsable files for review-style tasks")
146
+ max_steps: int = Field(..., ge=1, description="Maximum steps allowed")
147
+
148
+
149
+ class TaskSummary(BaseModel):
150
+ """Lightweight task metadata for list endpoints."""
151
 
152
+ task_id: str = Field(..., description="Stable task identifier")
153
+ difficulty: Difficulty = Field(..., description="Difficulty level")
154
+ title: str = Field(..., description="Human-readable title")
155
+ goal: str = Field(default="", description="Optional task goal")
156
 
157
 
158
  class ReviewFinding(BaseModel):
159
+ """Structured code review finding used by auxiliary review utilities."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
+ title: str = Field(..., description="Short human-readable finding title")
162
+ file_path: str = Field(default="", description="Optional file path")
163
+ line: Optional[int] = Field(default=None, ge=1, description="Optional 1-based line number")
164
+ category: Category = Field(default="bug", description="Finding category")
165
+ severity: Severity = Field(default="warning", description="Finding severity")
166
+ rationale: str = Field(default="", description="Why this matters")
167
+ recommendation: str = Field(default="", description="Suggested remediation")
168
+ rule_id: str = Field(default="", description="Stable detector or rubric identifier")
169
 
170
+ @property
171
+ def explanation(self) -> str:
172
+ """Backward-compatible alias used by older grading helpers."""
173
+ return self.rationale
174
 
175
+ @property
176
+ def suggested_fix(self) -> str:
177
+ """Backward-compatible alias used by older grading helpers."""
178
+ return self.recommendation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
 
181
  class DirectReviewResponse(BaseModel):
182
+ """Response payload for deterministic direct-review utilities."""
 
 
 
 
183
 
184
  issues: List[ReviewFinding] = Field(default_factory=list)
185
  summary: str = Field(default="")
 
187
  improved_code: Optional[str] = Field(default=None)
188
 
189
 
190
+ class TaskGrade(BaseModel):
191
+ """Grading result for task submission."""
 
 
 
192
 
193
+ score: float = Field(..., ge=0.0, le=1.0, description="Overall score")
194
+ syntax_score: float = Field(default=0.0, ge=0.0, le=1.0)
195
+ tests_passed: int = Field(default=0, ge=0)
196
+ tests_total: int = Field(default=0, ge=0)
197
+ quality_score: float = Field(default=0.0, ge=0.0, le=1.0)
198
+ runtime_score: float = Field(default=0.0, ge=0.0, le=1.0)
199
+ timed_out: bool = Field(default=False)
200
+ matched_issue_ids: List[str] = Field(default_factory=list)
201
+ false_positives: int = Field(default=0, ge=0)
202
+ duplicate_findings: int = Field(default=0, ge=0)
203
+ matched_weight: float = Field(default=0.0, ge=0.0, le=1.0)
204
+ details: Dict[str, Any] = Field(default_factory=dict)
205
+
206
+
207
+ class HealthResponse(BaseModel):
208
+ """Health check response."""
209
+
210
+ status: Literal["ok"] = "ok"
211
+ environment: str = "python_code_review_env"
212
+ task_count: int = Field(default=0, ge=0)
 
openenv.yaml CHANGED
@@ -1,7 +1,20 @@
1
- spec_version: 1
2
- name: python_env
3
- type: space
4
- runtime: fastapi
5
- app: server.app:app
6
- port: 8000
7
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: python_code_review_env
3
+ type: space
4
+ runtime: fastapi
5
+ app: server.app:app
6
+ port: 8000
7
+
8
+ metadata:
9
+ description: "Production-grade Python code review and repair benchmark for OpenEnv"
10
+ domain: code-review
11
+ task_count: 3
12
+ task_ids:
13
+ - syntax-fix-easy
14
+ - bug-fix-medium
15
+ - optimization-hard
16
+ difficulty_levels:
17
+ - easy
18
+ - medium
19
+ - hard
20
+
openenv_python_env.egg-info/PKG-INFO CHANGED
@@ -1,10 +1,13 @@
1
  Metadata-Version: 2.4
2
  Name: openenv-python_env
3
- Version: 0.1.0
4
- Summary: Python Env environment for OpenEnv
5
  Requires-Python: >=3.10
6
  Requires-Dist: openenv-core[core]>=0.2.2
7
- Requires-Dist: pydantic>=2.12.5
 
 
 
8
  Provides-Extra: dev
9
  Requires-Dist: pytest>=8.0.0; extra == "dev"
10
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
 
1
  Metadata-Version: 2.4
2
  Name: openenv-python_env
3
+ Version: 0.2.0
4
+ Summary: Deterministic Python code review and repair benchmark environment for OpenEnv
5
  Requires-Python: >=3.10
6
  Requires-Dist: openenv-core[core]>=0.2.2
7
+ Requires-Dist: fastapi>=0.115.0
8
+ Requires-Dist: uvicorn>=0.30.0
9
+ Requires-Dist: openai>=1.40.0
10
+ Requires-Dist: pytest>=8.0.0
11
  Provides-Extra: dev
12
  Requires-Dist: pytest>=8.0.0; extra == "dev"
13
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
openenv_python_env.egg-info/SOURCES.txt CHANGED
@@ -1,11 +1,8 @@
1
  README.md
2
- __init__.py
3
- client.py
4
- inference.py
5
- models.py
6
  pyproject.toml
7
  ./__init__.py
8
  ./client.py
 
9
  ./inference.py
10
  ./models.py
11
  openenv_python_env.egg-info/PKG-INFO
@@ -16,4 +13,15 @@ openenv_python_env.egg-info/requires.txt
16
  openenv_python_env.egg-info/top_level.txt
17
  server/__init__.py
18
  server/app.py
19
- server/python_env_environment.py
 
 
 
 
 
 
 
 
 
 
 
 
1
  README.md
 
 
 
 
2
  pyproject.toml
3
  ./__init__.py
4
  ./client.py
5
+ ./compat.py
6
  ./inference.py
7
  ./models.py
8
  openenv_python_env.egg-info/PKG-INFO
 
13
  openenv_python_env.egg-info/top_level.txt
14
  server/__init__.py
15
  server/app.py
16
+ server/code_review_env_environment.py
17
+ server/code_review_environment.py
18
+ server/env.py
19
+ server/env_safe.py
20
+ server/grading.py
21
+ server/python_env_environment.py
22
+ server/static_review.py
23
+ server/task_bank.py
24
+ tests/test_api.py
25
+ tests/test_environment.py
26
+ tests/test_examples.py
27
+ tests/test_reward_dynamics.py
openenv_python_env.egg-info/requires.txt CHANGED
@@ -1,5 +1,8 @@
1
  openenv-core[core]>=0.2.2
2
- pydantic>=2.12.5
 
 
 
3
 
4
  [dev]
5
  pytest>=8.0.0
 
1
  openenv-core[core]>=0.2.2
2
+ fastapi>=0.115.0
3
+ uvicorn>=0.30.0
4
+ openai>=1.40.0
5
+ pytest>=8.0.0
6
 
7
  [dev]
8
  pytest>=8.0.0
pyproject.toml CHANGED
@@ -1,46 +1,33 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- [build-system]
8
- requires = ["setuptools>=45", "wheel"]
9
- build-backend = "setuptools.build_meta"
10
-
11
- [project]
12
- name = "openenv-python_env"
13
- version = "0.1.0"
14
- description = "Python Env environment for OpenEnv"
15
- requires-python = ">=3.10"
16
- dependencies = [
17
- # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
- # install from github
19
- # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
- "openenv-core[core]>=0.2.2",
21
- # Environment-specific dependencies
22
- # Add all dependencies needed for your environment here
23
- # Examples:
24
- # "numpy>=1.19.0",
25
- # "torch>=2.0.0",
26
- # "gymnasium>=0.29.0",
27
- # "openspiel>=1.0.0",
28
- # "smolagents>=1.22.0,<2",
29
- "pydantic>=2.12.5",
30
- ]
31
-
32
- [project.optional-dependencies]
33
- dev = [
34
- "pytest>=8.0.0",
35
- "pytest-cov>=4.0.0",
36
- ]
37
-
38
- [project.scripts]
39
- # Server entry point - enables running via: uv run --project . server
40
- # or: python -m python_env.server.app
41
- server = "python_env.server.app:main"
42
-
43
- [tool.setuptools]
44
- include-package-data = true
45
- packages = ["python_env", "python_env.server"]
46
- package-dir = { "python_env" = ".", "python_env.server" = "server" }
 
1
+ [build-system]
2
+ requires = ["setuptools>=45", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "openenv-python_env"
7
+ version = "0.2.0"
8
+ description = "Deterministic Python code review and repair benchmark environment for OpenEnv"
9
+ requires-python = ">=3.10"
10
+ dependencies = [
11
+ "openenv-core[core]>=0.2.2",
12
+ "fastapi>=0.115.0",
13
+ "uvicorn>=0.30.0",
14
+ "openai>=1.40.0",
15
+ "pytest>=8.0.0",
16
+ ]
17
+
18
+ [project.optional-dependencies]
19
+ dev = [
20
+ "pytest>=8.0.0",
21
+ "pytest-cov>=4.0.0",
22
+ ]
23
+
24
+ [project.scripts]
25
+ server = "python_env.server.app:main"
26
+
27
+ [tool.setuptools]
28
+ include-package-data = true
29
+ packages = ["python_env", "python_env.server"]
30
+ package-dir = { "python_env" = ".", "python_env.server" = "server" }
31
+
32
+ [tool.pytest.ini_options]
33
+ testpaths = ["tests"]
 
 
 
 
 
 
 
 
 
 
 
 
 
pytest-cache-files-1f62ra1g/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-1f62ra1g/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-i2cpw3zw/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-i2cpw3zw/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-le0qcl0z/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-le0qcl0z/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-qm8xzmpt/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-qm8xzmpt/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-qun9v98v/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-qun9v98v/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-srp2otxc/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-srp2otxc/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-u6t7g29i/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-u6t7g29i/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
pytest-cache-files-x1yzwik9/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
pytest-cache-files-x1yzwik9/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
server/__init__.py CHANGED
@@ -1,11 +1,5 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """Python Env environment server components."""
8
-
9
- from .python_env_environment import PythonEnvironment
10
-
11
- __all__ = ["PythonEnvironment"]
 
1
+ """Server exports for the Python code review environment."""
2
+
3
+ from .code_review_environment import CodeReviewEnvironment, PythonCodeReviewEnvironment, PythonEnvironment
4
+
5
+ __all__ = ["PythonEnvironment", "PythonCodeReviewEnvironment", "CodeReviewEnvironment"]
 
 
 
 
 
 
server/app.py CHANGED
@@ -1,84 +1,117 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """
8
- FastAPI application for the Python Env Environment.
9
-
10
- This module creates an HTTP server that exposes the PythonEnvironment
11
- over HTTP and WebSocket endpoints, compatible with EnvClient.
12
-
13
- Endpoints:
14
- - POST /reset: Reset the environment
15
- - POST /step: Execute an action
16
- - GET /state: Get current environment state
17
- - GET /schema: Get action/observation schemas
18
- - WS /ws: WebSocket endpoint for persistent sessions
19
-
20
- Usage:
21
- # Development (with auto-reload):
22
- uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
23
-
24
- # Production:
25
- uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
26
-
27
- # Or run directly:
28
- python -m server.app
29
- """
30
-
31
  try:
32
- from openenv.core.env_server.http_server import create_app
33
- except Exception as e: # pragma: no cover
34
- raise ImportError(
35
- "openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
36
- ) from e
37
-
38
- try:
39
- from ..models import PythonAction, PythonObservation
40
- from .python_env_environment import PythonEnvironment
41
- except ImportError:
42
- from models import PythonAction, PythonObservation
43
- from server.python_env_environment import PythonEnvironment
44
-
45
 
46
- # Create the app with web interface and README integration
47
  app = create_app(
48
- PythonEnvironment,
49
- PythonAction,
50
- PythonObservation,
51
- env_name="python_env",
52
- max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
53
- )
54
-
55
-
56
- def main(host: str = "0.0.0.0", port: int = 8000):
57
- """
58
- Entry point for direct execution via uv run or python -m.
59
-
60
- This function enables running the server without Docker:
61
- uv run --project . server
62
- uv run --project . server --port 8001
63
- python -m python_env.server.app
64
-
65
- Args:
66
- host: Host address to bind to (default: "0.0.0.0")
67
- port: Port number to listen on (default: 8000)
68
-
69
- For production deployments, consider using uvicorn directly with
70
- multiple workers:
71
- uvicorn python_env.server.app:app --workers 4
72
- """
73
- import uvicorn
74
-
75
- uvicorn.run(app, host=host, port=port)
76
-
77
-
78
- if __name__ == "__main__":
79
- import argparse
80
-
81
- parser = argparse.ArgumentParser()
82
- parser.add_argument("--port", type=int, default=8000)
83
- args = parser.parse_args()
84
- main(port=args.port)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FastAPI application for the Python code review environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
+ from fastapi import APIRouter, HTTPException
8
+ from fastapi.responses import RedirectResponse
9
+
10
+ from compat import create_app
11
+
12
+ from models import (
13
+ HealthResponse,
14
+ PythonCodeReviewAction,
15
+ PythonCodeReviewObservation,
16
+ PythonCodeReviewState,
17
+ TaskDescriptor,
18
+ TaskGrade,
19
+ )
20
+ from server.env import PythonCodeReviewEnvironment
21
+
22
+
 
 
 
 
 
 
 
 
23
  try:
24
+ MAX_CONCURRENT_ENVS = max(int(os.getenv("MAX_CONCURRENT_ENVS", "16")), 1)
25
+ except Exception:
26
+ MAX_CONCURRENT_ENVS = 16
 
 
 
 
 
 
 
 
 
 
27
 
28
+ python_env = PythonCodeReviewEnvironment(verbose=False)
29
  app = create_app(
30
+ PythonCodeReviewEnvironment,
31
+ PythonCodeReviewAction,
32
+ PythonCodeReviewObservation,
33
+ max_concurrent_envs=MAX_CONCURRENT_ENVS,
34
+ )
35
+ router = APIRouter(tags=["python-code-review"])
36
+
37
+
38
+ @router.get("/", include_in_schema=False)
39
+ def root() -> RedirectResponse:
40
+ """Redirect root to API documentation."""
41
+ return RedirectResponse(url="/docs")
42
+
43
+
44
+ @router.get("/health", response_model=HealthResponse)
45
+ def health() -> HealthResponse:
46
+ """Health check endpoint for deployment monitoring."""
47
+ return python_env.health()
48
+
49
+
50
+ @router.get("/tasks", response_model=list)
51
+ def list_tasks() -> list:
52
+ """List all available deterministic tasks."""
53
+ return python_env.list_task_summaries()
54
+
55
+
56
+ @router.get("/tasks/{task_id}", response_model=object)
57
+ def get_task(task_id: str) -> object:
58
+ """Get a specific task by ID."""
59
+ try:
60
+ return python_env.get_task(task_id)
61
+ except ValueError as exc:
62
+ raise HTTPException(status_code=404, detail=str(exc)) from exc
63
+
64
+
65
+ @router.post("/tasks/{task_id}/grade", response_model=TaskGrade)
66
+ def grade_task(task_id: str, payload: PythonCodeReviewAction) -> TaskGrade:
67
+ """Grade code submission for a task without running an episode."""
68
+ if payload.action_type != "edit_code" or not payload.code:
69
+ raise HTTPException(
70
+ status_code=400,
71
+ detail="Requires action_type='edit_code' with code parameter."
72
+ )
73
+ try:
74
+ return python_env.grade_task_submission(task_id=task_id, code=payload.code)
75
+ except ValueError as exc:
76
+ raise HTTPException(status_code=404, detail=str(exc)) from exc
77
+
78
+
79
+ @router.post("/state", response_model=PythonCodeReviewState)
80
+ def get_state_post() -> RedirectResponse:
81
+ """Redirect POST /state to GET for compatibility."""
82
+ return RedirectResponse(url="/state", status_code=303)
83
+
84
+
85
+ app.include_router(router)
86
+
87
+
88
+ def _prioritize_route(path: str, methods: set[str]) -> None:
89
+ """Move a matching custom route ahead of default OpenEnv routes."""
90
+ try:
91
+ for index in range(len(app.router.routes) - 1, -1, -1):
92
+ route = app.router.routes[index]
93
+ route_path = getattr(route, "path", None)
94
+ route_methods = set(getattr(route, "methods", set()) or set())
95
+ if route_path == path and methods.issubset(route_methods):
96
+ app.router.routes.insert(0, app.router.routes.pop(index))
97
+ break
98
+ except Exception:
99
+ pass
100
+
101
+
102
+ _prioritize_route("/health", {"GET"})
103
+
104
+
105
+ def main(host: str = "0.0.0.0", port: int = 8000) -> None:
106
+ """Run the FastAPI application with uvicorn."""
107
+ import uvicorn
108
+ uvicorn.run(
109
+ app,
110
+ host=os.getenv("HOST", host),
111
+ port=int(os.getenv("PORT", str(port))),
112
+ )
113
+
114
+
115
+ if __name__ == "__main__":
116
+ main()
117
+
server/code_review_env_environment.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
"""Compatibility shim for older imports."""

# Prefer the absolute import used when the package sits flat on sys.path;
# fall back to the relative form when imported as a proper package.
try:
    from server.code_review_environment import CodeReviewEnvironment
except ModuleNotFoundError:  # pragma: no cover
    from .code_review_environment import CodeReviewEnvironment


__all__ = ["CodeReviewEnvironment"]
server/code_review_environment.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
"""Compatibility wrapper for older imports."""

# Historical module path kept alive for callers that still import
# ``server.code_review_environment``; the implementations live in ``env``.
from .env import CodeReviewEnvironment, PythonCodeReviewEnvironment, PythonEnvironment

__all__ = ["CodeReviewEnvironment", "PythonCodeReviewEnvironment", "PythonEnvironment"]
server/env.py ADDED
@@ -0,0 +1 @@
 
 
1
# Historical entry point: the real implementation lives in ``env_safe``.
from .env_safe import *  # noqa: F401,F403
server/env_safe.py ADDED
@@ -0,0 +1,492 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Safe OpenEnv environment for deterministic Python code repair tasks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Optional
6
+ from uuid import uuid4
7
+
8
+ from compat import Environment
9
+ from graders import grade_task
10
+ from models import (
11
+ HealthResponse,
12
+ HistoryEntry,
13
+ PythonCodeReviewAction,
14
+ PythonCodeReviewObservation,
15
+ PythonCodeReviewState,
16
+ RewardDetails,
17
+ TaskGrade,
18
+ )
19
+ from tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
20
+
21
+
22
# Reward-shaping constants (all applied inside compute_reward).
INVALID_ACTION_PENALTY = 0.10  # unsupported action, or any action after episode end
NO_PROGRESS_PENALTY = 0.08  # no metric improved and the code did not change
REPEATED_ACTION_PENALTY = 0.05  # same action type as the immediately previous step
BASE_STEP_PENALTY = 0.02  # flat per-step cost to encourage efficient episodes
ANALYZE_STEP_PENALTY = 0.01  # extra cost for analyze_code on top of the base cost
SUBMIT_COMPLETION_BONUS = 0.30  # bonus for submitting a fully correct solution
TIMEOUT_PENALTY = 0.12  # grading timed out while executing candidate code
VALID_ACTIONS = {"analyze_code", "edit_code", "run_tests", "submit_solution"}
30
+
31
+
32
+ def _clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
33
+ """Clamp a scalar to a bounded numeric interval."""
34
+ try:
35
+ return max(low, min(high, float(value)))
36
+ except Exception:
37
+ return low
38
+
39
+
40
+ def _safe_text(value: Any, default: str = "") -> str:
41
+ """Convert values into short stable strings."""
42
+ try:
43
+ text = str(value)
44
+ except Exception:
45
+ return default
46
+ text = " ".join(text.split())
47
+ return text[:240] if text else default
48
+
49
+
50
class PythonCodeReviewEnvironment(
    Environment[PythonCodeReviewAction, PythonCodeReviewObservation, PythonCodeReviewState]
):
    """Deterministic, bounded, evaluator-safe environment for code repair tasks."""

    # All mutable state lives on the instance, so parallel sessions each
    # using their own instance do not interfere.
    SUPPORTS_CONCURRENT_SESSIONS = True
56
+
57
    def __init__(self, verbose: bool = False) -> None:
        """Initialize the environment with a deterministic task rotation.

        Args:
            verbose: Diagnostic flag; stored on the instance but not read
                elsewhere in this module.
        """
        super().__init__()
        self._verbose = bool(verbose)
        # Fixed task rotation; _task_cursor advances through it on reset().
        self._task_order = self._safe_task_order()
        self._task_cursor = -1
        self._task: Optional[TaskSpec] = None
        # Placeholder state until the first reset() selects a task.
        self._state = PythonCodeReviewState(episode_id=str(uuid4()))
        self._done = False
        self._last_status = "Call reset() to start."
        self._last_reward = RewardDetails(value=0.0, reason="Environment initialized.")
        # Metric snapshot of the last graded code, used for reward deltas.
        self._metrics = self._blank_metrics()
        self._last_action_type = ""
69
+
70
    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        task_id: Optional[str] = None,
        **_: object,
    ) -> PythonCodeReviewObservation:
        """Reset the environment for a deterministic task and return an observation.

        Args:
            seed: Accepted for interface compatibility and ignored — task
                selection is deterministic.
            episode_id: Optional externally-supplied episode id; a UUID is
                generated when absent.
            task_id: Optional explicit task; otherwise the rotation advances.
        """
        del seed  # deterministic environment; seeding has no effect
        # NOTE(review): _reset_rubric is not defined in this class; the
        # resulting AttributeError is swallowed here — confirm whether a
        # subclass or mixin is expected to provide it.
        try:
            self._reset_rubric()
        except Exception:
            pass

        task = self._select_task(task_id)
        self._task = task
        self._done = False
        self._metrics = self._blank_metrics()
        self._last_action_type = ""
        self._last_status = "Inspect the code, run checks, edit the code, then submit."
        self._last_reward = RewardDetails(
            value=0.0,
            reason="Episode reset.",
            prev_score=0.0,
            curr_score=0.0,
        )
        # Fresh episode state seeded from the task's starter code.
        self._state = PythonCodeReviewState(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
            task_id=task.task_id,
            difficulty=task.difficulty,
            task_kind=task.task_kind,
            attempts_remaining=max(int(task.max_steps), 1),
            current_code=task.starter_code,
            errors="",
            test_results="No checks run yet.",
            history=[],
            score=0.0,
            done=False,
        )
        return self._build_observation()
111
+
112
    def step(
        self,
        action: PythonCodeReviewAction,
        timeout_s: Optional[float] = None,
        **_: object,
    ) -> PythonCodeReviewObservation:
        """Execute one safe environment step and always return a valid observation.

        Never raises: any internal failure is converted into an
        invalid-action penalty and a well-formed observation.
        """
        del timeout_s  # accepted for interface compatibility; grading is bounded internally
        try:
            # Lazily start an episode if the caller skipped reset().
            if self._task is None:
                return self.reset()

            # Acting on a finished episode is penalized but never raises.
            if self._done:
                self._last_status = "Episode already completed. Call reset() to continue."
                self._last_reward = RewardDetails(
                    value=-INVALID_ACTION_PENALTY,
                    invalid_action_penalty=INVALID_ACTION_PENALTY,
                    reason="Episode already completed.",
                    prev_score=self._metrics["score"],
                    curr_score=self._metrics["score"],
                    code_changed=False,
                )
                return self._build_observation()

            self._state.step_count += 1
            action_type = _safe_text(getattr(action, "action_type", "analyze_code"), "analyze_code")
            code = getattr(action, "code", None)

            # analyze_code / run_tests grade the current code on visible checks;
            # submit_solution grades on hidden checks as well and ends the episode.
            if action_type == "analyze_code":
                self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=False)
            elif action_type == "run_tests":
                self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=False)
            elif action_type == "edit_code":
                self._handle_edit(code)
            elif action_type == "submit_solution":
                self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=True)
                self._done = True
            else:
                self._apply_invalid_action(f"Unsupported action_type '{action_type}'.")

            # Exhausting the step budget force-submits whatever code is current.
            self._state.attempts_remaining = max(self._task.max_steps - self._state.step_count, 0)
            if self._state.attempts_remaining == 0 and not self._done:
                self._auto_submit()

            self._state.done = self._done
            return self._build_observation()
        except Exception as exc:
            # Last-resort guard: record the failure as an invalid action.
            self._apply_invalid_action(f"Step failure handled: {_safe_text(exc, 'unknown_error')}")
            self._state.done = self._done
            return self._build_observation()
162
+
163
+ @property
164
+ def state(self) -> PythonCodeReviewState:
165
+ """Return a deep copy of the current environment state."""
166
+ try:
167
+ return self._state.model_copy(deep=True)
168
+ except Exception:
169
+ return PythonCodeReviewState(episode_id=str(uuid4()))
170
+
171
+ def list_task_summaries(self) -> list[object]:
172
+ """Return public task summaries."""
173
+ try:
174
+ return list_task_summaries()
175
+ except Exception:
176
+ return []
177
+
178
    def get_task(self, task_id: str) -> object:
        """Return a single public task descriptor.

        NOTE(review): _select_task falls back to the rotation when
        ``task_id`` is unknown, so this never raises for a bad id — confirm
        that the HTTP layer's 404 path (which expects ValueError) is reachable.
        """
        return self._select_task(task_id).to_descriptor()
181
+
182
    def health(self) -> HealthResponse:
        """Return a health response reporting the number of loaded tasks."""
        return HealthResponse(task_count=len(self._task_order))
185
+
186
    def grade_task_submission(self, task_id: str, code: str) -> TaskGrade:
        """Grade a task submission outside an episode without raising.

        Hidden checks are always included, so this reflects the final
        submission score. Failures return a zero-score grade with the error
        recorded in ``details``.
        """
        try:
            task = self._select_task(task_id)
            return self._safe_grade(task=task, candidate_code=code, include_hidden=True)
        except Exception as exc:
            return TaskGrade(score=0.0, details={"error": _safe_text(exc, "grading_failed")})
193
+
194
    def run_tests(self, code: str, include_hidden: bool = False) -> tuple[float, dict[str, int], TaskGrade]:
        """Run deterministic grading and return (score, pass-counts, full grade).

        Uses the active task, or advances the rotation when no episode has
        been started yet.
        """
        task = self._task or self._select_task(None)
        grade = self._safe_grade(task=task, candidate_code=code, include_hidden=include_hidden)
        return (
            _clamp(grade.score),
            {"passed": int(grade.tests_passed), "total": int(grade.tests_total)},
            grade,
        )
203
+
204
+ def apply_action(self, action: PythonCodeReviewAction) -> str:
205
+ """Return the candidate code implied by the action."""
206
+ if getattr(action, "action_type", "") == "edit_code":
207
+ code = getattr(action, "code", None)
208
+ return str(code) if code is not None else self._state.current_code
209
+ return self._state.current_code
210
+
211
    def compute_reward(
        self,
        action_type: str,
        previous_metrics: dict[str, float],
        current_metrics: dict[str, float],
        grade: TaskGrade,
        code_changed: bool,
        invalid_action: bool = False,
    ) -> RewardDetails:
        """Compute a bounded dynamic reward with progress and efficiency shaping.

        Args:
            action_type: The action being rewarded (e.g. "edit_code").
            previous_metrics: Metric snapshot before the action.
            current_metrics: Metric snapshot after the action.
            grade: The grading result for the current code.
            code_changed: Whether the candidate code differs from the prior code.
            invalid_action: Whether the action was rejected as invalid.

        Returns:
            RewardDetails with the total in [-1, 1] plus each shaping component.
        """
        # Deltas between the previous and current metric snapshots.
        prev_score = _clamp(previous_metrics.get("score", 0.0))
        curr_score = _clamp(current_metrics.get("score", 0.0))
        score_delta = curr_score - prev_score
        test_delta = current_metrics.get("test_fraction", 0.0) - previous_metrics.get("test_fraction", 0.0)
        syntax_delta = current_metrics.get("syntax_score", 0.0) - previous_metrics.get("syntax_score", 0.0)
        quality_delta = current_metrics.get("quality_score", 0.0) - previous_metrics.get("quality_score", 0.0)

        # Efficiency shaping: every step costs something; analyzing costs extra,
        # and repeating the previous action type costs more still.
        step_penalty = BASE_STEP_PENALTY + (ANALYZE_STEP_PENALTY if action_type == "analyze_code" else 0.0)
        repeated_penalty = REPEATED_ACTION_PENALTY if action_type == self._last_action_type else 0.0
        # "No progress" means no metric improved beyond float noise AND the
        # code itself did not change.
        no_progress = (
            score_delta <= 1e-9
            and test_delta <= 1e-9
            and syntax_delta <= 1e-9
            and quality_delta <= 1e-9
            and not code_changed
        )
        stagnation_penalty = NO_PROGRESS_PENALTY if no_progress and not invalid_action else 0.0
        # Score regressions are penalized at 60% of the drop, folded in with
        # the step and repetition costs.
        regression_penalty = max(-score_delta, 0.0) * 0.6 + repeated_penalty + step_penalty
        invalid_penalty = INVALID_ACTION_PENALTY if invalid_action else 0.0
        timeout_penalty = TIMEOUT_PENALTY if bool(grade.timed_out) else 0.0

        # Positive shaping: each metric improvement is rewarded at its own rate.
        progress_reward = max(score_delta, 0.0) * 0.7
        syntax_reward = max(syntax_delta, 0.0) * 0.5
        test_reward = max(test_delta, 0.0) * 1.0
        quality_bonus = max(quality_delta, 0.0) * 0.2
        # The completion bonus only fires on an essentially perfect submission.
        correctness_bonus = SUBMIT_COMPLETION_BONUS if action_type == "submit_solution" and curr_score >= 0.999 else 0.0

        reward_value = (
            progress_reward
            + syntax_reward
            + test_reward
            + quality_bonus
            + correctness_bonus
            - stagnation_penalty
            - regression_penalty
            - invalid_penalty
            - timeout_penalty
        )
        # Clamp the total to [-1, 1] for stable downstream consumption.
        reward_value = max(-1.0, min(1.0, round(reward_value, 6)))
        return RewardDetails(
            value=reward_value,
            syntax_reward=round(syntax_reward, 6),
            test_reward=round(test_reward, 6),
            quality_bonus=round(quality_bonus, 6),
            correctness_bonus=round(correctness_bonus, 6),
            progress_delta=round(progress_reward, 6),
            stagnation_penalty=round(stagnation_penalty, 6),
            regression_penalty=round(regression_penalty, 6),
            invalid_action_penalty=round(invalid_penalty, 6),
            timeout_penalty=round(timeout_penalty, 6),
            reason=f"{action_type} reward computed safely",
            prev_score=round(prev_score, 6),
            curr_score=round(curr_score, 6),
            code_changed=bool(code_changed),
        )
276
+
277
+ def _safe_task_order(self) -> list[str]:
278
+ """Load deterministic task ids with a hard fallback."""
279
+ try:
280
+ loaded = list(task_ids())
281
+ if loaded:
282
+ return [str(task_id) for task_id in loaded]
283
+ except Exception:
284
+ pass
285
+ return ["syntax-fix-easy", "bug-fix-medium", "optimization-hard"]
286
+
287
+ def _blank_metrics(self) -> dict[str, float]:
288
+ """Return an empty metric snapshot."""
289
+ return {
290
+ "score": 0.0,
291
+ "test_fraction": 0.0,
292
+ "syntax_score": 0.0,
293
+ "quality_score": 0.0,
294
+ }
295
+
296
    def _select_task(self, task_id: Optional[str]) -> TaskSpec:
        """Select the requested task or advance the rotation deterministically.

        NOTE(review): lookup failures for an explicit ``task_id`` are
        swallowed and the rotation advances instead, so an unknown id silently
        yields a different task — confirm this is intended for the API layer.
        """
        try:
            if task_id:
                task = load_task(task_id)
                # Keep the cursor aligned so subsequent resets continue from here.
                if task.task_id in self._task_order:
                    self._task_cursor = self._task_order.index(task.task_id)
                return task
        except Exception:
            pass

        try:
            self._task_cursor = (self._task_cursor + 1) % len(self._task_order)
            return load_task(self._task_order[self._task_cursor])
        except Exception:
            # Hard fallback to a task id that is expected to always exist.
            return load_task("syntax-fix-easy")
312
+
313
    def _safe_grade(self, task: TaskSpec, candidate_code: str, include_hidden: bool) -> TaskGrade:
        """Run grading without allowing exceptions to escape.

        A grading failure degrades to a zero-score TaskGrade whose ``details``
        carry the error text, keeping the episode alive.
        """
        try:
            return grade_task(candidate_code, task, include_hidden=include_hidden)
        except Exception as exc:
            return TaskGrade(
                score=0.0,
                syntax_score=0.0,
                tests_passed=0,
                tests_total=max(len(task.visible_tests), 1),
                details={"compile_error": "", "error": _safe_text(exc, "grading_failed")},
            )
325
+
326
    def _metrics_from_grade(self, grade: TaskGrade) -> dict[str, float]:
        """Derive normalized reward metrics (all in [0, 1]) from a grading result."""
        tests_total = max(int(grade.tests_total), 0)
        tests_passed = max(int(grade.tests_passed), 0)
        # Tasks with no tests use the syntax score as the test-fraction proxy.
        test_fraction = (tests_passed / tests_total) if tests_total else _clamp(grade.syntax_score)
        return {
            "score": _clamp(grade.score),
            "test_fraction": _clamp(test_fraction),
            "syntax_score": _clamp(grade.syntax_score),
            "quality_score": _clamp(grade.quality_score),
        }
337
+
338
    def _format_test_results(self, grade: TaskGrade, include_hidden: bool) -> str:
        """Format test execution results for the observation.

        The scope prefix tells the agent whether hidden checks were included;
        compile errors and timeouts take precedence over pass counts.
        """
        compile_error = _safe_text(grade.details.get("compile_error", ""), "")
        scope = "all checks" if include_hidden else "visible checks"
        if compile_error:
            return f"{scope}: compile error: {compile_error}"
        if grade.timed_out:
            return f"{scope}: execution timed out"
        # Syntax-fix tasks have no behavioral tests; compiling is the result.
        if self._task and self._task.task_kind == "syntax_fix":
            return "visible checks: code compiles successfully"
        return f"{scope}: {int(grade.tests_passed)}/{int(grade.tests_total)} passing"
349
+
350
+ def _build_status(self, action_type: str, grade: TaskGrade) -> str:
351
+ """Build a human-readable status message."""
352
+ if action_type == "submit_solution":
353
+ return f"Solution submitted. Final score: {_clamp(grade.score):.3f}"
354
+ if action_type == "edit_code":
355
+ if grade.details.get("compile_error"):
356
+ return "Code updated, but syntax issues remain."
357
+ return "Code updated and evaluated."
358
+ if action_type == "run_tests":
359
+ return "Test run completed."
360
+ if action_type == "analyze_code":
361
+ return "Analysis completed."
362
+ return "Action handled safely."
363
+
364
    def _apply_grade_to_state(self, grade: TaskGrade, include_hidden: bool) -> None:
        """Update episode state (score, errors, test summary) from a grading result."""
        compile_error = _safe_text(grade.details.get("compile_error", ""), "")
        self._state.score = _clamp(grade.score)
        self._state.errors = compile_error
        self._state.test_results = self._format_test_results(grade, include_hidden=include_hidden)
370
+
371
    def _handle_scored_action(self, action_type: str, candidate_code: str, include_hidden: bool) -> None:
        """Grade code, update state, and compute reward for a valid action.

        For ``edit_code`` the candidate replaces the current code before
        grading; all other actions grade the existing code in place.
        """
        task = self._task or self._select_task(None)
        previous_metrics = dict(self._metrics)
        prior_code = self._state.current_code
        code_changed = candidate_code.strip() != prior_code.strip()
        if action_type == "edit_code":
            self._state.current_code = candidate_code
        grade = self._safe_grade(task=task, candidate_code=self._state.current_code, include_hidden=include_hidden)
        current_metrics = self._metrics_from_grade(grade)
        self._apply_grade_to_state(grade, include_hidden=include_hidden)
        # compute_reward reads self._last_action_type, so the reward must be
        # computed BEFORE _last_action_type is updated below.
        self._last_reward = self.compute_reward(
            action_type=action_type,
            previous_metrics=previous_metrics,
            current_metrics=current_metrics,
            grade=grade,
            code_changed=code_changed,
            invalid_action=False,
        )
        self._last_status = self._build_status(action_type, grade)
        self._metrics = current_metrics
        self._last_action_type = action_type
        self._append_history(action_type, self._last_status, self._last_reward.value)
394
+
395
+ def _handle_edit(self, code: Optional[str]) -> None:
396
+ """Validate edit input and evaluate the new candidate code."""
397
+ safe_code = (code or "").strip()
398
+ if not safe_code:
399
+ self._apply_invalid_action("edit_code requires code parameter.")
400
+ return
401
+ self._handle_scored_action(action_type="edit_code", candidate_code=safe_code, include_hidden=False)
402
+
403
    def _apply_invalid_action(self, reason: str) -> None:
        """Record an invalid action without crashing the episode.

        Metrics are carried over unchanged so only the invalid-action penalty
        affects the reward; the history entry is logged as ``analyze_code``
        because invalid types are not part of the stable action vocabulary.
        """
        previous_metrics = dict(self._metrics)
        # Synthesize a grade that mirrors the current metrics (no re-grading).
        grade = TaskGrade(score=previous_metrics["score"], syntax_score=previous_metrics["syntax_score"])
        self._last_reward = self.compute_reward(
            action_type="invalid",
            previous_metrics=previous_metrics,
            current_metrics=previous_metrics,
            grade=grade,
            code_changed=False,
            invalid_action=True,
        )
        self._last_status = reason
        self._append_history("analyze_code", reason, self._last_reward.value)
417
+
418
    def _auto_submit(self) -> None:
        """Finalize the episode when the step budget is exhausted.

        Grades the current code with hidden checks included and marks the
        episode done. No new reward is computed here — the reward from the
        final explicit action stands.
        """
        task = self._task or self._select_task(None)
        grade = self._safe_grade(task=task, candidate_code=self._state.current_code, include_hidden=True)
        self._apply_grade_to_state(grade, include_hidden=True)
        self._done = True
        self._state.done = True
        self._last_status = f"Auto-submitted. Final score: {_clamp(grade.score):.3f}"
426
+
427
    def _append_history(self, action_type: str, status: str, reward: float) -> None:
        """Append one action record to the episode history; never raises.

        Unknown action types are normalized to ``analyze_code`` so history
        entries always validate against the stable action vocabulary.
        """
        try:
            stable_action = action_type if action_type in VALID_ACTIONS else "analyze_code"
            self._state.history.append(
                HistoryEntry(
                    step=max(int(self._state.step_count), 0),
                    action_type=stable_action,
                    status=_safe_text(status, "handled"),
                    reward=float(reward),
                )
            )
        except Exception:
            pass
441
+
442
    def _build_observation(self) -> PythonCodeReviewObservation:
        """Build a valid observation from current state.

        Construction failures fall back to a minimal, always-valid
        observation carrying the error text, so callers never see an
        exception from this method.
        """
        task = self._task
        try:
            return PythonCodeReviewObservation(
                task_id=self._state.task_id or "",
                title=task.title if task else "",
                difficulty=self._state.difficulty or "easy",
                task_kind=self._state.task_kind,
                task_description=task.task_description if task else "",
                current_code=self._state.current_code,
                errors=self._state.errors,
                test_results=self._state.test_results,
                visible_tests=list(task.visible_tests) if task else [],
                history=list(self._state.history),
                attempts_remaining=max(int(self._state.attempts_remaining), 0),
                last_action_status=self._last_status,
                score=_clamp(self._state.score),
                reward_details=self._last_reward,
                reward=self._last_reward.value,
                done=bool(self._state.done),
                metadata={
                    "prev_score": self._last_reward.prev_score,
                    "curr_score": self._last_reward.curr_score,
                },
            )
        except Exception as exc:
            # Fallback observation: safe defaults plus the build error.
            return PythonCodeReviewObservation(
                task_id=self._state.task_id or "",
                title="",
                difficulty="easy",
                task_kind=None,
                task_description="",
                current_code=getattr(self._state, "current_code", ""),
                errors=_safe_text(exc, "observation_build_failed"),
                test_results="visible checks: unavailable",
                visible_tests=[],
                history=[],
                attempts_remaining=0,
                last_action_status="Observation fallback returned safely.",
                score=0.0,
                reward_details=RewardDetails(value=0.0, reason="Observation fallback."),
                reward=0.0,
                done=bool(getattr(self._state, "done", False)),
                metadata={},
            )
488
+
489
+
490
# Backwards-compatible aliases for older import paths.
PythonEnvironment = PythonCodeReviewEnvironment
CodeReviewEnvironment = PythonCodeReviewEnvironment
492
+
server/grading.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic grading helpers for PR-review tasks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from dataclasses import dataclass
7
+ from typing import Iterable, List, Optional, Sequence, Set
8
+
9
+ try:
10
+ from models import ReviewFinding, TaskGrade
11
+ from server.task_bank import RubricIssue, TaskSpec
12
+ except ModuleNotFoundError: # pragma: no cover
13
+ from ..models import ReviewFinding, TaskGrade
14
+ from .task_bank import RubricIssue, TaskSpec
15
+
16
+
17
FALSE_POSITIVE_PENALTY = 0.10  # deducted per finding matching no rubric issue
DUPLICATE_PENALTY = 0.05  # deducted per fingerprint-identical repeated finding
19
+
20
+
21
@dataclass(frozen=True)
class FindingMatch:
    """Result of matching one finding against the rubric."""

    # Matched rubric issue id, or None for a false positive / duplicate.
    issue_id: Optional[str]
    # True when the finding repeats an already-seen fingerprint.
    duplicate: bool = False
27
+
28
+
29
def finding_fingerprint(finding: ReviewFinding) -> str:
    """Build a deterministic fingerprint for duplicate detection.

    The fingerprint is the sorted token set of every descriptive field, so
    re-worded but substantively identical findings collapse together.
    """
    parts = (
        finding.file_path,
        str(finding.line or 0),
        finding.category,
        finding.severity,
        finding.title,
        finding.explanation,
        finding.suggested_fix,
    )
    return "|".join(sorted(tokens(" ".join(parts))))
44
+
45
+
46
def match_finding(
    finding: ReviewFinding,
    task: TaskSpec,
    matched_issue_ids: Set[str],
    seen_fingerprints: Set[str],
) -> FindingMatch:
    """Match one finding against the remaining rubric issues.

    A duplicate fingerprint short-circuits before any rubric comparison, and
    already-matched issues are never matched a second time.
    """
    if finding_fingerprint(finding) in seen_fingerprints:
        return FindingMatch(issue_id=None, duplicate=True)

    open_issues = (
        issue for issue in task.rubric_issues if issue.issue_id not in matched_issue_ids
    )
    for issue in open_issues:
        if finding_matches_issue(finding, issue):
            return FindingMatch(issue_id=issue.issue_id)
    return FindingMatch(issue_id=None)
64
+
65
+
66
def finding_matches_issue(finding: ReviewFinding, issue: RubricIssue) -> bool:
    """Return True when a finding deterministically matches a rubric issue.

    File, category, and severity must agree exactly and the line must be
    within +/-2 of the rubric's; enough of the issue's keywords must then
    appear among the finding's free-text tokens.
    """
    location_matches = (
        finding.file_path == issue.file_path
        and finding.category == issue.category
        and finding.severity == issue.severity
        and finding.line is not None
        and abs(finding.line - issue.line) <= 2
    )
    if not location_matches:
        return False

    free_text = " ".join([finding.title, finding.explanation, finding.suggested_fix])
    finding_tokens = tokens(free_text)
    hits = len([keyword for keyword in issue.keywords if keyword in finding_tokens])
    return hits >= issue.min_keyword_hits
83
+
84
+
85
def score_task(
    task: TaskSpec,
    matched_issue_ids: Iterable[str],
    false_positives: int = 0,
    duplicate_findings: int = 0,
) -> TaskGrade:
    """Score a task from cumulative episode state.

    Matched rubric weights are summed, flat penalties are subtracted for
    false positives and duplicates, and the result is clamped to [0, 1].
    """
    matched_set = set(matched_issue_ids)
    matched_weight = sum(
        issue.weight for issue in task.rubric_issues if issue.issue_id in matched_set
    )
    penalized = (
        matched_weight
        - false_positives * FALSE_POSITIVE_PENALTY
        - duplicate_findings * DUPLICATE_PENALTY
    )
    final_score = max(0.0, min(1.0, round(penalized, 6)))
    return TaskGrade(
        score=final_score,
        matched_issue_ids=sorted(matched_set),
        false_positives=false_positives,
        duplicate_findings=duplicate_findings,
        matched_weight=min(1.0, round(matched_weight, 6)),
    )
108
+
109
+
110
def grade_findings(task: TaskSpec, findings: Sequence[ReviewFinding]) -> TaskGrade:
    """Offline-grade a batch of findings for one task.

    Each finding is classified as a duplicate, a rubric match, or a false
    positive; the cumulative tallies are then scored via score_task.
    """
    matched: Set[str] = set()
    fingerprints_seen: Set[str] = set()
    false_positive_count = 0
    duplicate_count = 0

    for finding in findings:
        outcome = match_finding(
            finding=finding,
            task=task,
            matched_issue_ids=matched,
            seen_fingerprints=fingerprints_seen,
        )
        if outcome.duplicate:
            duplicate_count += 1
            continue
        fingerprints_seen.add(finding_fingerprint(finding))
        if outcome.issue_id is not None:
            matched.add(outcome.issue_id)
        else:
            false_positive_count += 1

    return score_task(
        task=task,
        matched_issue_ids=matched,
        false_positives=false_positive_count,
        duplicate_findings=duplicate_count,
    )
141
+
142
+
143
def tokens(text: str) -> Set[str]:
    """Normalize free text into a deterministic set of comparison tokens.

    Lowercases the input and keeps maximal runs of ``[a-z0-9_]``; every other
    character acts as a separator.
    """
    lowered = text.lower()
    return {match for match in re.findall(r"[a-z0-9_]+", lowered)}
147
+
server/python_env_environment.py CHANGED
@@ -1,421 +1,9 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """Python code-review environment implementation."""
8
-
9
- from __future__ import annotations
10
-
11
- from dataclasses import dataclass
12
- from datetime import UTC, datetime
13
- from typing import Dict, Iterable, List, Optional
14
- from uuid import uuid4
15
-
16
- from openenv.core.env_server.interfaces import Environment
17
- from openenv.core.env_server.types import State
18
-
19
- try:
20
- from ..models import (
21
- PythonAction,
22
- PythonEnvConfig,
23
- PythonObservation,
24
- ReviewFinding,
25
- TaskDescriptor,
26
- TaskEvaluation,
27
- )
28
- except ImportError:
29
- from models import ( # type: ignore
30
- PythonAction,
31
- PythonEnvConfig,
32
- PythonObservation,
33
- ReviewFinding,
34
- TaskDescriptor,
35
- TaskEvaluation,
36
- )
37
-
38
-
39
- @dataclass(frozen=True)
40
- class ReferenceFinding:
41
- """Hidden finding metadata used for deterministic grading."""
42
-
43
- rule_id: str
44
- title: str
45
- line: int
46
- category: str
47
- severity: str
48
- rationale: str
49
- recommendation: str
50
- weight: float
51
-
52
-
53
- @dataclass(frozen=True)
54
- class ReviewTask:
55
- """A visible task plus its hidden grading references."""
56
-
57
- descriptor: TaskDescriptor
58
- references: tuple[ReferenceFinding, ...]
59
- hint: str
60
- patched_code: Optional[str] = None
61
-
62
-
63
- TASK_BANK: Dict[str, ReviewTask] = {
64
- "py-review-easy": ReviewTask(
65
- descriptor=TaskDescriptor(
66
- task_id="py-review-easy",
67
- difficulty="easy",
68
- title="Mutable default argument",
69
- objective="Find the correctness issue and explain a safe fix.",
70
- code=(
71
- "def add_tag(tag, tags=[]):\n"
72
- " tags.append(tag)\n"
73
- " return tags\n"
74
- ),
75
- max_steps=4,
76
- success_threshold=0.7,
77
- ),
78
- references=(
79
- ReferenceFinding(
80
- rule_id="mutable-default",
81
- title="Mutable default list is shared across calls",
82
- line=1,
83
- category="bug",
84
- severity="warning",
85
- rationale="The list persists between calls and leaks state.",
86
- recommendation="Use None as the default and create a new list inside the function.",
87
- weight=1.0,
88
- ),
89
- ),
90
- hint="Look for state that survives between separate function calls.",
91
- patched_code=(
92
- "def add_tag(tag, tags=None):\n"
93
- " if tags is None:\n"
94
- " tags = []\n"
95
- " tags.append(tag)\n"
96
- " return tags\n"
97
- ),
98
- ),
99
- "py-review-medium": ReviewTask(
100
- descriptor=TaskDescriptor(
101
- task_id="py-review-medium",
102
- difficulty="medium",
103
- title="Unsafe shell invocation",
104
- objective="Review the snippet for security-sensitive behavior.",
105
- code=(
106
- "import os\n\n"
107
- "def run_backup(path):\n"
108
- " os.system(f\"tar -czf backup.tgz {path}\")\n"
109
- ),
110
- max_steps=4,
111
- success_threshold=0.72,
112
- ),
113
- references=(
114
- ReferenceFinding(
115
- rule_id="shell-injection",
116
- title="User input is interpolated into a shell command",
117
- line=4,
118
- category="security",
119
- severity="critical",
120
- rationale="An attacker can inject shell metacharacters through the path argument.",
121
- recommendation="Use subprocess with an argument list instead of os.system.",
122
- weight=1.0,
123
- ),
124
- ),
125
- hint="Check how external commands are invoked and whether user input is escaped.",
126
- patched_code=(
127
- "import subprocess\n\n"
128
- "def run_backup(path):\n"
129
- " subprocess.run([\"tar\", \"-czf\", \"backup.tgz\", path], check=True)\n"
130
- ),
131
- ),
132
- "py-review-hard": ReviewTask(
133
- descriptor=TaskDescriptor(
134
- task_id="py-review-hard",
135
- difficulty="hard",
136
- title="Retry helper hides failures",
137
- objective="Identify correctness and maintainability issues in the retry logic.",
138
- code=(
139
- "import time\n\n"
140
- "def fetch_with_retry(client, url, retries=3):\n"
141
- " last_error = None\n"
142
- " for _ in range(retries):\n"
143
- " try:\n"
144
- " return client.get(url, timeout=1)\n"
145
- " except Exception as exc:\n"
146
- " last_error = exc\n"
147
- " time.sleep(0.1)\n"
148
- " return None\n"
149
- ),
150
- max_steps=4,
151
- success_threshold=0.74,
152
- ),
153
- references=(
154
- ReferenceFinding(
155
- rule_id="swallowed-error",
156
- title="Function swallows the final exception and returns None",
157
- line=10,
158
- category="bug",
159
- severity="warning",
160
- rationale="Callers cannot distinguish a failed request from a valid None result.",
161
- recommendation="Re-raise the last exception after retries are exhausted.",
162
- weight=0.65,
163
- ),
164
- ReferenceFinding(
165
- rule_id="broad-except",
166
- title="Broad exception handler catches unexpected failures",
167
- line=7,
168
- category="maintainability",
169
- severity="info",
170
- rationale="Catching Exception masks programming errors and interrupts.",
171
- recommendation="Catch only the client or network exceptions you expect to retry.",
172
- weight=0.35,
173
- ),
174
- ),
175
- hint="Consider what happens to the final error after the retry loop finishes.",
176
- patched_code=(
177
- "import time\n\n"
178
- "def fetch_with_retry(client, url, retries=3):\n"
179
- " last_error = None\n"
180
- " for _ in range(retries):\n"
181
- " try:\n"
182
- " return client.get(url, timeout=1)\n"
183
- " except client.retryable_exceptions as exc:\n"
184
- " last_error = exc\n"
185
- " time.sleep(0.1)\n"
186
- " if last_error is not None:\n"
187
- " raise last_error\n"
188
- ),
189
- ),
190
- }
191
-
192
-
193
- def _utc_now() -> str:
194
- return datetime.now(UTC).isoformat()
195
-
196
-
197
- def _normalize_text(value: Optional[str]) -> str:
198
- return " ".join((value or "").strip().lower().split())
199
-
200
-
201
- def _normalize_code(value: Optional[str]) -> str:
202
- return "\n".join(line.rstrip() for line in (value or "").strip().splitlines())
203
-
204
-
205
- class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
206
- """Deterministic benchmark environment for Python code review tasks."""
207
-
208
- SUPPORTS_CONCURRENT_SESSIONS: bool = True
209
-
210
- def __init__(self, config: Optional[PythonEnvConfig] = None):
211
- super().__init__()
212
- self._config = config or PythonEnvConfig()
213
- self._state = State(episode_id=str(uuid4()), step_count=0)
214
- self._task_cursor = -1
215
- self._current_task: Optional[ReviewTask] = None
216
- self._submitted_findings: List[ReviewFinding] = []
217
- self._hints_used = 0
218
- self._created_at = _utc_now()
219
-
220
- def reset(
221
- self,
222
- seed: Optional[int] = None,
223
- episode_id: Optional[str] = None,
224
- **kwargs,
225
- ) -> PythonObservation:
226
- """Start the next configured review task."""
227
-
228
- del seed, kwargs
229
- self._task_cursor = (self._task_cursor + 1) % len(self._config.task_order)
230
- task_id = self._config.task_order[self._task_cursor]
231
- self._current_task = TASK_BANK.get(task_id, TASK_BANK["py-review-easy"])
232
- self._state = State(
233
- episode_id=episode_id or str(uuid4()),
234
- step_count=0,
235
- )
236
- self._submitted_findings = []
237
- self._hints_used = 0
238
- self._created_at = _utc_now()
239
- return self._build_observation(
240
- feedback="New review task loaded. Submit findings or request a hint.",
241
- reward=0.0,
242
- done=False,
243
- )
244
-
245
- def step(
246
- self,
247
- action: PythonAction,
248
- timeout_s: Optional[float] = None,
249
- **kwargs,
250
- ) -> PythonObservation:
251
- """Process one review action and return updated feedback."""
252
-
253
- del timeout_s, kwargs
254
- if self._current_task is None:
255
- return self.reset()
256
-
257
- self._state.step_count += 1
258
- operation = action.operation
259
- feedback = ""
260
- reward = 0.0
261
- done = False
262
-
263
- if operation == "request_hint":
264
- self._hints_used += 1
265
- feedback = self._current_task.hint
266
- evaluation = self._evaluate(self._submitted_findings, action.patched_code)
267
- reward = evaluation.score
268
- else:
269
- if action.findings:
270
- self._submitted_findings.extend(action.findings)
271
- evaluation = self._evaluate(self._submitted_findings, action.patched_code)
272
- reward = evaluation.score
273
- if operation == "finalize":
274
- done = True
275
- feedback = (
276
- "Review finalized. "
277
- f"Matched {evaluation.matched_findings}/{evaluation.total_findings} "
278
- "reference findings."
279
- )
280
- else:
281
- feedback = (
282
- f"Progress saved. Matched {evaluation.matched_findings}/"
283
- f"{evaluation.total_findings} findings with score {evaluation.score:.2f}."
284
- )
285
-
286
- if self._state.step_count >= self._max_steps():
287
- done = True
288
- if operation != "finalize":
289
- feedback = (
290
- f"{feedback} Maximum steps reached."
291
- if feedback
292
- else "Maximum steps reached."
293
- )
294
-
295
- return self._build_observation(
296
- feedback=feedback,
297
- reward=reward,
298
- done=done,
299
- patched_code=action.patched_code,
300
- )
301
-
302
- def _build_observation(
303
- self,
304
- *,
305
- feedback: str,
306
- reward: float,
307
- done: bool,
308
- patched_code: Optional[str] = None,
309
- ) -> PythonObservation:
310
- assert self._current_task is not None
311
- evaluation = self._evaluate(self._submitted_findings, patched_code)
312
- attempts_remaining = max(
313
- self._max_steps() - self._state.step_count,
314
- 0,
315
- )
316
- return PythonObservation(
317
- task=self._current_task.descriptor,
318
- feedback=feedback,
319
- submitted_findings=list(self._submitted_findings),
320
- hints_used=self._hints_used,
321
- attempts_remaining=attempts_remaining,
322
- evaluation=evaluation,
323
- score=evaluation.score,
324
- review_time_ms=float(self._state.step_count * 125),
325
- done=done,
326
- reward=reward,
327
- metadata={
328
- "episode_id": self._state.episode_id,
329
- "created_at": self._created_at,
330
- "updated_at": _utc_now(),
331
- },
332
- )
333
-
334
- def _evaluate(
335
- self,
336
- findings: Iterable[ReviewFinding],
337
- patched_code: Optional[str],
338
- ) -> TaskEvaluation:
339
- assert self._current_task is not None
340
-
341
- references = self._current_task.references
342
- matched_reference_ids: List[str] = []
343
- matched_weight = 0.0
344
- false_positives = 0
345
- duplicate_findings = 0
346
-
347
- seen_ids = set()
348
- for finding in findings:
349
- ref_id = self._match_reference(finding, references)
350
- if ref_id is None:
351
- false_positives += 1
352
- continue
353
- if ref_id in seen_ids:
354
- duplicate_findings += 1
355
- continue
356
- seen_ids.add(ref_id)
357
- matched_reference_ids.append(ref_id)
358
- matched_weight += next(ref.weight for ref in references if ref.rule_id == ref_id)
359
-
360
- total_weight = sum(ref.weight for ref in references) or 1.0
361
- weighted_recall = min(matched_weight / total_weight, 1.0)
362
-
363
- patch_score = 0.0
364
- if self._current_task.patched_code and patched_code:
365
- patch_score = float(
366
- _normalize_code(patched_code) == _normalize_code(self._current_task.patched_code)
367
- )
368
-
369
- raw_score = (
370
- weighted_recall
371
- + (self._config.patch_bonus_multiplier * patch_score)
372
- - (self._config.false_positive_penalty * false_positives)
373
- - (self._config.duplicate_penalty * duplicate_findings)
374
- - (self._config.hint_penalty * self._hints_used)
375
- )
376
- score = max(0.0, min(raw_score, 1.0))
377
-
378
- return TaskEvaluation(
379
- matched_reference_ids=matched_reference_ids,
380
- matched_findings=len(matched_reference_ids),
381
- total_findings=len(references),
382
- false_positives=false_positives,
383
- duplicate_findings=duplicate_findings,
384
- weighted_recall=weighted_recall,
385
- patch_score=patch_score,
386
- score=score,
387
- passed=score >= self._current_task.descriptor.success_threshold,
388
- )
389
-
390
- def _match_reference(
391
- self,
392
- finding: ReviewFinding,
393
- references: Iterable[ReferenceFinding],
394
- ) -> Optional[str]:
395
- finding_rule = _normalize_text(finding.rule_id)
396
- finding_title = _normalize_text(finding.title)
397
- for reference in references:
398
- if finding_rule and finding_rule == _normalize_text(reference.rule_id):
399
- return reference.rule_id
400
- line_matches = finding.line is not None and finding.line == reference.line
401
- category_matches = finding.category == reference.category
402
- title_matches = finding_title and (
403
- finding_title in _normalize_text(reference.title)
404
- or _normalize_text(reference.title) in finding_title
405
- )
406
- if line_matches and (category_matches or title_matches):
407
- return reference.rule_id
408
- return None
409
-
410
- def _max_steps(self) -> int:
411
- assert self._current_task is not None
412
- return min(
413
- self._current_task.descriptor.max_steps,
414
- self._config.max_steps_per_task,
415
- )
416
-
417
- @property
418
- def state(self) -> State:
419
- """Return the current environment state."""
420
-
421
- return self._state
 
1
+ """Compatibility shim for older imports."""
2
+
3
+ try:
4
+ from server.code_review_environment import PythonEnvironment
5
+ except ModuleNotFoundError: # pragma: no cover
6
+ from .code_review_environment import PythonEnvironment
7
+
8
+
9
+ __all__ = ["PythonEnvironment"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
- openenv[core]>=0.2.0
2
- fastapi>=0.115.0
3
- uvicorn>=0.24.0
4
-
5
-
6
-
 
1
+ openenv-core[core]>=0.2.2
2
+ fastapi>=0.115.0
3
+ uvicorn[standard]>=0.30.0
4
+ openai>=1.40.0
5
+ pytest>=8.0.0
6
+ pydantic>=2.0.0
server/static_review.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic static-review helpers for arbitrary Python code.
2
+
3
+ Unlike the benchmark grader, this module does not compare against hidden rubric
4
+ items. Instead, it performs direct AST-based review on arbitrary snippets so it
5
+ can be used for manual testing, examples, and future dataset generation.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import ast
11
+ from typing import List, Optional
12
+
13
+ try:
14
+ from models import DirectReviewResponse, ReviewFinding
15
+ except ModuleNotFoundError: # pragma: no cover
16
+ from ..models import DirectReviewResponse, ReviewFinding
17
+
18
+
19
+ class _StaticAnalyzer(ast.NodeVisitor):
20
+ """AST visitor that emits structured review findings.
21
+
22
+ The visitor intentionally focuses on a small set of high-signal patterns so
23
+ the direct-review endpoint stays predictable and easy to understand.
24
+ """
25
+
26
+ def __init__(self) -> None:
27
+ self.issues: List[ReviewFinding] = []
28
+
29
+ def visit_FunctionDef(self, node: ast.FunctionDef) -> None: # noqa: N802
30
+ """Flag mutable default arguments in function definitions."""
31
+
32
+ for default in list(node.args.defaults):
33
+ if isinstance(default, (ast.List, ast.Dict, ast.Set)):
34
+ self.issues.append(
35
+ ReviewFinding(
36
+ title="Mutable default argument",
37
+ line=getattr(default, "lineno", node.lineno),
38
+ category="bug",
39
+ severity="warning",
40
+ rationale=(
41
+ "Mutable defaults persist across calls and can leak state "
42
+ "between unrelated requests."
43
+ ),
44
+ recommendation="Use None as the default and create the object inside the function.",
45
+ rule_id="mutable-default-list",
46
+ )
47
+ )
48
+ self.generic_visit(node)
49
+
50
+ def visit_Call(self, node: ast.Call) -> None: # noqa: N802
51
+ """Inspect function calls for obviously unsafe or noisy patterns."""
52
+
53
+ func_name = self._call_name(node)
54
+ if func_name in {"eval", "exec"}:
55
+ self.issues.append(
56
+ ReviewFinding(
57
+ title=f"Avoid {func_name} on untrusted input",
58
+ line=node.lineno,
59
+ category="security",
60
+ severity="critical",
61
+ rationale=(
62
+ f"{func_name} executes arbitrary code and is unsafe on "
63
+ "user-controlled input."
64
+ ),
65
+ recommendation="Use a safe parser or a whitelist-based evaluator.",
66
+ rule_id="avoid-eval" if func_name == "eval" else "avoid-exec",
67
+ )
68
+ )
69
+ if func_name.endswith("check_output") or func_name.endswith("run"):
70
+ for keyword in node.keywords:
71
+ # `shell=True` is only a problem when the command comes from a
72
+ # shell-parsed string, but this heuristic is high value for
73
+ # review and intentionally conservative.
74
+ if keyword.arg == "shell" and isinstance(keyword.value, ast.Constant) and keyword.value.value is True:
75
+ self.issues.append(
76
+ ReviewFinding(
77
+ title="shell=True with dynamic input",
78
+ line=node.lineno,
79
+ category="security",
80
+ severity="critical",
81
+ rationale=(
82
+ "shell=True executes through the shell and can allow "
83
+ "command injection when the command string is interpolated."
84
+ ),
85
+ recommendation="Pass a list of arguments and keep shell=False.",
86
+ rule_id="shell-true-command-injection",
87
+ )
88
+ )
89
+ if func_name == "print":
90
+ self.issues.append(
91
+ ReviewFinding(
92
+ title="Print statement in application logic",
93
+ line=node.lineno,
94
+ category="style",
95
+ severity="info",
96
+ rationale="Production services should prefer structured logging over print statements.",
97
+ recommendation="Use the logging module or return the value to the caller.",
98
+ rule_id="print-statement",
99
+ )
100
+ )
101
+ self.generic_visit(node)
102
+
103
+ def visit_ExceptHandler(self, node: ast.ExceptHandler) -> None: # noqa: N802
104
+ """Flag bare exception handlers that hide failures."""
105
+
106
+ if node.type is None:
107
+ self.issues.append(
108
+ ReviewFinding(
109
+ title="Bare except",
110
+ line=node.lineno,
111
+ category="maintainability",
112
+ severity="warning",
113
+ rationale="Bare except catches KeyboardInterrupt and other system-level exceptions.",
114
+ recommendation="Catch a specific exception and record the failure.",
115
+ rule_id="bare-except",
116
+ )
117
+ )
118
+ self.generic_visit(node)
119
+
120
+ def visit_For(self, node: ast.For) -> None: # noqa: N802
121
+ """Look for list-membership checks nested in loops."""
122
+
123
+ for child in ast.walk(node):
124
+ if isinstance(child, ast.Compare) and any(
125
+ isinstance(operator, (ast.In, ast.NotIn)) for operator in child.ops
126
+ ):
127
+ if isinstance(child.comparators[0], ast.Name):
128
+ self.issues.append(
129
+ ReviewFinding(
130
+ title="Potential quadratic membership check inside loop",
131
+ line=child.lineno,
132
+ category="performance",
133
+ severity="warning",
134
+ rationale=(
135
+ "Repeated membership checks against a list inside a loop "
136
+ "can degrade to quadratic runtime."
137
+ ),
138
+ recommendation="Use a set or dict for O(1) membership checks.",
139
+ rule_id="quadratic-membership-check",
140
+ )
141
+ )
142
+ break
143
+ self.generic_visit(node)
144
+
145
+ @staticmethod
146
+ def _call_name(node: ast.Call) -> str:
147
+ """Extract a dotted function name such as `subprocess.run`."""
148
+
149
+ func = node.func
150
+ if isinstance(func, ast.Name):
151
+ return func.id
152
+ if isinstance(func, ast.Attribute):
153
+ prefix = _StaticAnalyzer._attribute_prefix(func.value)
154
+ return f"{prefix}.{func.attr}" if prefix else func.attr
155
+ return ""
156
+
157
+ @staticmethod
158
+ def _attribute_prefix(node: ast.AST) -> str:
159
+ """Reconstruct the left-hand side of an attribute chain."""
160
+
161
+ if isinstance(node, ast.Name):
162
+ return node.id
163
+ if isinstance(node, ast.Attribute):
164
+ prefix = _StaticAnalyzer._attribute_prefix(node.value)
165
+ return f"{prefix}.{node.attr}" if prefix else node.attr
166
+ return ""
167
+
168
+
169
+ def analyze_python_code(code: str) -> List[ReviewFinding]:
170
+ """Analyze arbitrary Python code and return structured findings."""
171
+
172
+ if not code.strip():
173
+ return [
174
+ ReviewFinding(
175
+ title="No code provided",
176
+ category="bug",
177
+ severity="warning",
178
+ rationale="The reviewer cannot inspect an empty submission.",
179
+ recommendation="Provide Python source code.",
180
+ rule_id="empty-input",
181
+ )
182
+ ]
183
+
184
+ # Syntax errors are turned into findings rather than exceptions so API
185
+ # consumers always get a valid response shape.
186
+ try:
187
+ tree = ast.parse(code)
188
+ except SyntaxError as exc:
189
+ return [
190
+ ReviewFinding(
191
+ title="Syntax error",
192
+ line=exc.lineno,
193
+ category="bug",
194
+ severity="critical",
195
+ rationale=exc.msg,
196
+ recommendation="Fix the syntax error before running static review.",
197
+ rule_id="syntax-error",
198
+ )
199
+ ]
200
+
201
+ analyzer = _StaticAnalyzer()
202
+ analyzer.visit(tree)
203
+ return _deduplicate(analyzer.issues)
204
+
205
+
206
+ def build_direct_review_response(
207
+ code: str, context: Optional[str] = None
208
+ ) -> DirectReviewResponse:
209
+ """Build the public direct-review response for the `/review` route."""
210
+
211
+ issues = analyze_python_code(code)
212
+ weighted_penalty = 0.0
213
+ # The direct-review score is intentionally simple: more severe issues lower
214
+ # the score more aggressively.
215
+ for issue in issues:
216
+ if issue.severity == "critical":
217
+ weighted_penalty += 0.3
218
+ elif issue.severity == "warning":
219
+ weighted_penalty += 0.15
220
+ else:
221
+ weighted_penalty += 0.05
222
+
223
+ score = max(0.0, min(1.0, 1.0 - weighted_penalty))
224
+ summary = _build_summary(issues, context)
225
+ improved_code = _suggest_improved_code(code, issues)
226
+ return DirectReviewResponse(
227
+ issues=issues,
228
+ summary=summary,
229
+ score=score,
230
+ improved_code=improved_code,
231
+ )
232
+
233
+
234
+ def _build_summary(issues: List[ReviewFinding], context: Optional[str]) -> str:
235
+ """Create a concise human-readable summary for the direct-review response."""
236
+
237
+ if not issues:
238
+ base = "No obvious issues were detected by the deterministic reviewer."
239
+ else:
240
+ critical = sum(1 for issue in issues if issue.severity == "critical")
241
+ warnings = sum(1 for issue in issues if issue.severity == "warning")
242
+ infos = sum(1 for issue in issues if issue.severity == "info")
243
+ base = (
244
+ f"Detected {len(issues)} issue(s): {critical} critical, "
245
+ f"{warnings} warning, {infos} info."
246
+ )
247
+ if context:
248
+ return f"{base} Context: {context}"
249
+ return base
250
+
251
+
252
+ def _suggest_improved_code(code: str, issues: List[ReviewFinding]) -> Optional[str]:
253
+ """Append high-level fix directions to the submitted code."""
254
+
255
+ if not issues:
256
+ return None
257
+ suggestions = [issue.recommendation for issue in issues if issue.recommendation]
258
+ comment = " | ".join(dict.fromkeys(suggestions))
259
+ return f"{code.rstrip()}\n\n# Suggested review directions: {comment}"
260
+
261
+
262
+ def _deduplicate(findings: List[ReviewFinding]) -> List[ReviewFinding]:
263
+ """Drop duplicate findings that refer to the same rule and line."""
264
+
265
+ seen = set()
266
+ unique: List[ReviewFinding] = []
267
+ for finding in findings:
268
+ key = (finding.rule_id, finding.line, finding.category)
269
+ if key in seen:
270
+ continue
271
+ seen.add(key)
272
+ unique.append(finding)
273
+ return unique
server/task_bank.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Static PR-review tasks and hidden grading rubrics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Dict, Iterable, List, Sequence
7
+
8
+ try:
9
+ from models import Category, Difficulty, Severity, TaskDescriptor, TaskSummary
10
+ except ModuleNotFoundError: # pragma: no cover
11
+ from ..models import Category, Difficulty, Severity, TaskDescriptor, TaskSummary
12
+
13
+
14
+ @dataclass(frozen=True)
15
+ class RubricIssue:
16
+ """One hidden issue that can be matched by the deterministic grader."""
17
+
18
+ issue_id: str
19
+ file_path: str
20
+ line: int
21
+ category: Category
22
+ severity: Severity
23
+ keywords: Sequence[str]
24
+ min_keyword_hits: int
25
+ weight: float
26
+
27
+
28
+ @dataclass(frozen=True)
29
+ class TaskSpec:
30
+ """Complete task definition, including hidden rubric metadata."""
31
+
32
+ task_id: str
33
+ difficulty: Difficulty
34
+ title: str
35
+ goal: str
36
+ repo_summary: str
37
+ visible_diff: str
38
+ file_contents: Dict[str, str]
39
+ changed_files: Sequence[str]
40
+ rubric_issues: Sequence[RubricIssue]
41
+ max_steps: int
42
+
43
+ @property
44
+ def available_files(self) -> List[str]:
45
+ return list(self.file_contents.keys())
46
+
47
+ def to_descriptor(self) -> TaskDescriptor:
48
+ return TaskDescriptor(
49
+ task_id=self.task_id,
50
+ difficulty=self.difficulty,
51
+ title=self.title,
52
+ goal=self.goal,
53
+ repo_summary=self.repo_summary,
54
+ changed_files=list(self.changed_files),
55
+ available_files=self.available_files,
56
+ max_steps=self.max_steps,
57
+ )
58
+
59
+ def to_summary(self) -> TaskSummary:
60
+ return TaskSummary(
61
+ task_id=self.task_id,
62
+ difficulty=self.difficulty,
63
+ title=self.title,
64
+ goal=self.goal,
65
+ )
66
+
67
+
68
+ TASKS: List[TaskSpec] = [
69
+ TaskSpec(
70
+ task_id="py-pr-review-easy",
71
+ difficulty="easy",
72
+ title="Retry Delay Regression",
73
+ goal=(
74
+ "Review the pull request and identify the real bug introduced in the retry "
75
+ "delay helper before it ships."
76
+ ),
77
+ repo_summary=(
78
+ "This service computes retry delays for background notification delivery. "
79
+ "The change is intended to relax validation for legacy callers."
80
+ ),
81
+ visible_diff="\n".join(
82
+ [
83
+ "diff --git a/src/notifications/retry.py b/src/notifications/retry.py",
84
+ "@@",
85
+ "- if base_delay <= 0:",
86
+ "+ if base_delay < 0:",
87
+ " return 0.0",
88
+ ]
89
+ ),
90
+ file_contents={
91
+ "src/notifications/retry.py": "\n".join(
92
+ [
93
+ "from __future__ import annotations",
94
+ "",
95
+ "def calculate_retry_delay(attempt: int, base_delay: float = 2.0) -> float:",
96
+ ' """Return the retry delay in seconds."""',
97
+ " if attempt < 0:",
98
+ ' raise ValueError(\"attempt must be >= 0\")',
99
+ " if base_delay < 0:",
100
+ " return 0.0",
101
+ " return attempt / base_delay",
102
+ ]
103
+ )
104
+ },
105
+ changed_files=("src/notifications/retry.py",),
106
+ rubric_issues=(
107
+ RubricIssue(
108
+ issue_id="zero-base-delay-divides",
109
+ file_path="src/notifications/retry.py",
110
+ line=7,
111
+ category="bug",
112
+ severity="warning",
113
+ keywords=("zero", "division", "base_delay"),
114
+ min_keyword_hits=2,
115
+ weight=1.0,
116
+ ),
117
+ ),
118
+ max_steps=4,
119
+ ),
120
+ TaskSpec(
121
+ task_id="py-pr-review-medium",
122
+ difficulty="medium",
123
+ title="Coupon Billing Rollout",
124
+ goal=(
125
+ "Review the billing change and identify both the production regression and "
126
+ "the missing coverage that would have caught it."
127
+ ),
128
+ repo_summary=(
129
+ "The billing service is adding coupon support for one-off invoices. The PR "
130
+ "touches both the service code and its unit tests."
131
+ ),
132
+ visible_diff="\n".join(
133
+ [
134
+ "diff --git a/app/billing/invoice_service.py b/app/billing/invoice_service.py",
135
+ "@@",
136
+ " def charge_invoice(order: dict, gateway: Gateway) -> str:",
137
+ "- return gateway.charge(order[\"customer_id\"], order[\"amount_cents\"])",
138
+ "+ total = order[\"amount_cents\"]",
139
+ "+ coupon = order.get(\"coupon_code\")",
140
+ "+ if coupon:",
141
+ "+ discount = gateway.lookup_discount(coupon)",
142
+ "+ total = max(total - discount, 0)",
143
+ "+ return gateway.charge(order[\"customer_id\"], order[\"amount_cents\"])",
144
+ "",
145
+ "diff --git a/tests/test_invoice_service.py b/tests/test_invoice_service.py",
146
+ "@@",
147
+ " class FakeGateway:",
148
+ "+ def lookup_discount(self, coupon: str) -> int:",
149
+ "+ return 250",
150
+ ]
151
+ ),
152
+ file_contents={
153
+ "app/billing/invoice_service.py": "\n".join(
154
+ [
155
+ "from gateway import Gateway",
156
+ "",
157
+ "def charge_invoice(order: dict, gateway: Gateway) -> str:",
158
+ ' total = order["amount_cents"]',
159
+ ' coupon = order.get("coupon_code")',
160
+ " if coupon:",
161
+ " discount = gateway.lookup_discount(coupon)",
162
+ " total = max(total - discount, 0)",
163
+ ' return gateway.charge(order["customer_id"], order["amount_cents"])',
164
+ ]
165
+ ),
166
+ "tests/test_invoice_service.py": "\n".join(
167
+ [
168
+ "from app.billing.invoice_service import charge_invoice",
169
+ "",
170
+ "class FakeGateway:",
171
+ " def lookup_discount(self, coupon: str) -> int:",
172
+ " return 250",
173
+ "",
174
+ " def charge(self, customer_id: str, amount_cents: int) -> str:",
175
+ " self.last_charge = (customer_id, amount_cents)",
176
+ ' return "charge_123"',
177
+ "",
178
+ "def test_charge_invoice_without_coupon():",
179
+ " gateway = FakeGateway()",
180
+ ' charge_invoice({"customer_id": "cus_1", "amount_cents": 1000}, gateway)',
181
+ ' assert gateway.last_charge == ("cus_1", 1000)',
182
+ ]
183
+ ),
184
+ },
185
+ changed_files=("app/billing/invoice_service.py", "tests/test_invoice_service.py"),
186
+ rubric_issues=(
187
+ RubricIssue(
188
+ issue_id="discount-total-unused",
189
+ file_path="app/billing/invoice_service.py",
190
+ line=8,
191
+ category="bug",
192
+ severity="warning",
193
+ keywords=("discount", "total", "charge", "amount"),
194
+ min_keyword_hits=2,
195
+ weight=0.6,
196
+ ),
197
+ RubricIssue(
198
+ issue_id="missing-coupon-test",
199
+ file_path="tests/test_invoice_service.py",
200
+ line=11,
201
+ category="testing",
202
+ severity="warning",
203
+ keywords=("missing", "test", "coupon", "discount"),
204
+ min_keyword_hits=2,
205
+ weight=0.4,
206
+ ),
207
+ ),
208
+ max_steps=5,
209
+ ),
210
+ TaskSpec(
211
+ task_id="py-pr-review-hard",
212
+ difficulty="hard",
213
+ title="Async Job Runner Deduplication",
214
+ goal=(
215
+ "Review the async job-runner PR and find the subtle concurrency issues "
216
+ "without inventing extra problems."
217
+ ),
218
+ repo_summary=(
219
+ "A shared webhook backfill service is deduplicating in-flight work with an "
220
+ "async task cache and writing the latest result for operators to inspect."
221
+ ),
222
+ visible_diff="\n".join(
223
+ [
224
+ "diff --git a/app/jobs/runner.py b/app/jobs/runner.py",
225
+ "@@",
226
+ " async def run_job(job_id: str, payload: dict, worker) -> str:",
227
+ " if job_id in ACTIVE_RUNS:",
228
+ " return await ACTIVE_RUNS[job_id]",
229
+ "+ lock = asyncio.Lock()",
230
+ "+ async with lock:",
231
+ "+ task = asyncio.create_task(worker.run(payload))",
232
+ "+ ACTIVE_RUNS[job_id] = task",
233
+ " try:",
234
+ " result = await task",
235
+ " finally:",
236
+ " ACTIVE_RUNS.pop(job_id, None)",
237
+ "+ Path(\"latest-result.json\").write_text(result)",
238
+ " return result",
239
+ ]
240
+ ),
241
+ file_contents={
242
+ "app/jobs/runner.py": "\n".join(
243
+ [
244
+ "import asyncio",
245
+ "from pathlib import Path",
246
+ "",
247
+ "ACTIVE_RUNS: dict[str, asyncio.Task[str]] = {}",
248
+ "",
249
+ "async def run_job(job_id: str, payload: dict, worker) -> str:",
250
+ " if job_id in ACTIVE_RUNS:",
251
+ " return await ACTIVE_RUNS[job_id]",
252
+ "",
253
+ " lock = asyncio.Lock()",
254
+ " async with lock:",
255
+ " task = asyncio.create_task(worker.run(payload))",
256
+ " ACTIVE_RUNS[job_id] = task",
257
+ " try:",
258
+ " result = await task",
259
+ " finally:",
260
+ " ACTIVE_RUNS.pop(job_id, None)",
261
+ "",
262
+ ' Path("latest-result.json").write_text(result)',
263
+ " return result",
264
+ ]
265
+ ),
266
+ "tests/test_runner.py": "\n".join(
267
+ [
268
+ "import pytest",
269
+ "",
270
+ "from app.jobs.runner import run_job",
271
+ "",
272
+ "class FakeWorker:",
273
+ " async def run(self, payload: dict) -> str:",
274
+ ' return payload["job_id"]',
275
+ "",
276
+ "@pytest.mark.asyncio",
277
+ "async def test_run_job_returns_worker_result():",
278
+ " worker = FakeWorker()",
279
+ ' result = await run_job("job-1", {"job_id": "job-1"}, worker)',
280
+ ' assert result == "job-1"',
281
+ ]
282
+ ),
283
+ },
284
+ changed_files=("app/jobs/runner.py", "tests/test_runner.py"),
285
+ rubric_issues=(
286
+ RubricIssue(
287
+ issue_id="per-call-lock-race",
288
+ file_path="app/jobs/runner.py",
289
+ line=9,
290
+ category="bug",
291
+ severity="warning",
292
+ keywords=("lock", "race", "concurrent", "duplicate"),
293
+ min_keyword_hits=2,
294
+ weight=0.55,
295
+ ),
296
+ RubricIssue(
297
+ issue_id="shared-output-file-race",
298
+ file_path="app/jobs/runner.py",
299
+ line=18,
300
+ category="maintainability",
301
+ severity="warning",
302
+ keywords=("latest", "result", "file", "concurrent", "overwrite"),
303
+ min_keyword_hits=2,
304
+ weight=0.45,
305
+ ),
306
+ ),
307
+ max_steps=6,
308
+ ),
309
+ ]
310
+
311
+
312
+ TASKS_BY_ID: Dict[str, TaskSpec] = {task.task_id: task for task in TASKS}
313
+
314
+
315
+ def list_task_descriptors() -> List[TaskDescriptor]:
316
+ """Return public descriptors for all tasks."""
317
+
318
+ return [task.to_descriptor() for task in TASKS]
319
+
320
+
321
+ def list_task_summaries() -> List[TaskSummary]:
322
+ """Return task summaries for lightweight route responses."""
323
+
324
+ return [task.to_summary() for task in TASKS]
325
+
326
+
327
+ def get_task(task_id: str) -> TaskSpec:
328
+ """Return a task by id."""
329
+
330
+ try:
331
+ return TASKS_BY_ID[task_id]
332
+ except KeyError as exc: # pragma: no cover
333
+ raise ValueError(f"Unknown task_id: {task_id}") from exc
334
+
335
+
336
+ def task_ids() -> Iterable[str]:
337
+ """Return task ids in benchmark order."""
338
+
339
+ return [task.task_id for task in TASKS]
340
+
summary/01_introduction_quickstart.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 01. Introduction & Quick Start
2
+
3
+ Source:
4
+ - https://meta-pytorch.org/OpenEnv/auto_getting_started/plot_01_introduction_quickstart.html
5
+
6
+ ## Main idea
7
+
8
+ OpenEnv is a standardized framework for building, sharing, and using RL environments as typed, containerized services.
9
+
10
+ The official docs frame it as:
11
+
12
+ - Gym-style interaction
13
+ - Docker-based isolation
14
+ - typed contracts
15
+ - HTTP/WebSocket access
16
+ - easy sharing through Hugging Face
17
+
18
+ ## Core loop
19
+
20
+ The RL interaction model is still the normal loop:
21
+
22
+ 1. reset environment
23
+ 2. observe state
24
+ 3. choose action
25
+ 4. call step
26
+ 5. receive reward + next observation
27
+ 6. repeat until done
28
+
29
+ The difference is that OpenEnv wraps this loop in a typed client/server system.
30
+
31
+ ## Why OpenEnv instead of only Gym
32
+
33
+ The docs emphasize these advantages:
34
+
35
+ - type safety
36
+ - environment isolation through containers
37
+ - better reproducibility
38
+ - easier sharing and deployment
39
+ - language-agnostic communication
40
+ - cleaner debugging
41
+
42
+ The key contrast is:
43
+
44
+ - old style: raw arrays and same-process execution
45
+ - OpenEnv style: typed objects and isolated environment runtime
46
+
47
+ ## Important mental model
48
+
49
+ OpenEnv treats environments more like services than in-process libraries.
50
+
51
+ That means:
52
+
53
+ - your environment logic can run separately from the agent code
54
+ - failures in the environment do not automatically crash the training loop
55
+ - deployment and usage are closer to how production systems work
56
+
57
+ ## What this means for `python_env`
58
+
59
+ Your repo should keep these properties intact:
60
+
61
+ - typed `Action`, `Observation`, and evaluation models
62
+ - a clean environment class with `reset()`, `step()`, and `state`
63
+ - a client that hides transport details
64
+ - a deployable container
65
+
66
+ For hackathon purposes, this page is the justification for why your project is not just a script. It is a reusable environment artifact.
summary/02_using_environments.md ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 02. Using Environments
2
+
3
+ Source:
4
+ - https://meta-pytorch.org/OpenEnv/auto_getting_started/plot_02_using_environments.html
5
+
6
+ ## Main idea
7
+
8
+ This page is about how users consume an existing OpenEnv environment.
9
+
10
+ The docs highlight three connection methods:
11
+
12
+ 1. from Hugging Face Hub
13
+ 2. from Docker image
14
+ 3. from direct base URL
15
+
16
+ ## Connection methods
17
+
18
+ ### 1. From Hugging Face Hub
19
+
20
+ The easiest route for end users.
21
+
22
+ Typical flow:
23
+
24
+ - pull the image from the HF registry
25
+ - start the container locally
26
+ - connect to it
27
+ - clean it up on close
28
+
29
+ The docs show the pattern conceptually as:
30
+
31
+ ```python
32
+ MyEnv.from_hub("owner/env-name")
33
+ ```
34
+
35
+ ### 2. From Docker image
36
+
37
+ Useful when:
38
+
39
+ - you already built the image locally
40
+ - you want reproducible local runs
41
+ - you do not want to depend on a live remote Space
42
+
43
+ Typical pattern:
44
+
45
+ ```python
46
+ MyEnv.from_docker_image("my-env:latest")
47
+ ```
48
+
49
+ ### 3. Direct URL connection
50
+
51
+ Useful when:
52
+
53
+ - the server is already running
54
+ - you want to connect to localhost or a deployed Space
55
+
56
+ Typical pattern:
57
+
58
+ ```python
59
+ MyEnv(base_url="http://localhost:8000")
60
+ ```
61
+
62
+ ## WebSocket model
63
+
64
+ The docs emphasize that OpenEnv uses WebSocket-backed sessions for persistent environment interaction.
65
+
66
+ Why this matters:
67
+
68
+ - lower overhead than stateless HTTP on every step
69
+ - cleaner session management
70
+ - better fit for multi-step RL loops
71
+
72
+ ## Environment loop
73
+
74
+ The intended use pattern is:
75
+
76
+ 1. connect
77
+ 2. reset
78
+ 3. repeatedly call `step(action)`
79
+ 4. inspect `reward`, `done`, and `observation`
80
+ 5. close cleanly
81
+
82
+ ## What this means for `python_env`
83
+
84
+ Your environment should be easy to consume in all three modes:
85
+
86
+ - local URL
87
+ - local Docker image
88
+ - HF Space
89
+
90
+ That means the most important user-facing checks are:
91
+
92
+ - `reset()` works
93
+ - `step()` works
94
+ - the client can parse the observation correctly
95
+ - Docker image starts cleanly
96
+ - deployed Space responds on `/health`, `/docs`, and session routes
97
+
98
+ For hackathon validation, this page is basically the “user experience” standard you need to match.