Spaces:
Sleeping
Sleeping
Commit Β·
ced8fd0
0
Parent(s):
commit
Browse files- Dockerfile +39 -0
- README.md +262 -0
- agents/__init__.py +1 -0
- agents/baseline_agent.py +271 -0
- env.py +211 -0
- graders/__init__.py +1 -0
- graders/grader.py +201 -0
- inference.py +275 -0
- models.py +92 -0
- openenv.yaml +99 -0
- requirements.txt +6 -0
- server.py +126 -0
- tasks/__init__.py +1 -0
- tasks/task1_easy.py +124 -0
- tasks/task2_medium.py +208 -0
- tasks/task3_hard.py +241 -0
- tests/test_env.py +162 -0
Dockerfile
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ---- Build stage -------------------------------------------------------------
# Compiles/installs Python dependencies into /install so the runtime image
# stays slim and carries no build toolchain.
FROM python:3.11-slim AS builder

WORKDIR /app

# Install build dependencies (gcc is needed only to compile any C extensions
# pulled in by requirements.txt; the apt cache is removed to keep the layer small).
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
# --prefix=/install keeps the installed packages in one tree we can copy out.
RUN pip install --no-cache-dir --prefix=/install -r requirements.txt

# ---- Runtime stage -------------------------------------------------------------
FROM python:3.11-slim

# Hugging Face Spaces expects the app to run as a non-root user (uid 1000).
RUN useradd -m -u 1000 appuser

WORKDIR /app

# Copy installed packages from the build stage into the standard prefix.
COPY --from=builder /install /usr/local

# Copy application code, owned by the runtime user.
COPY --chown=appuser:appuser . .

# Ensure sub-packages are importable.
# NOTE(review): this runs as root, so any file *created* here would be
# root-owned; presumably these __init__.py files already exist from the COPY
# above and only get their mtime bumped — confirm against the repo contents.
RUN touch tasks/__init__.py graders/__init__.py agents/__init__.py

USER appuser

# HF Spaces routes traffic to port 7860.
EXPOSE 7860

ENV PYTHONUNBUFFERED=1
ENV PYTHONPATH=/app

CMD ["python", "-m", "uvicorn", "server:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# π CodeReviewEnv
|
| 2 |
+
|
| 3 |
+
> An OpenEnv-compliant benchmark environment where AI agents act as senior engineers reviewing pull requests β catching bugs, finding security holes, and fixing broken code.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Overview & Motivation
|
| 8 |
+
|
| 9 |
+
Code review is one of the highest-leverage activities in software engineering, yet it is time-consuming, inconsistent, and cognitively demanding. A model that can reliably triage pull requests, identify security vulnerabilities, and produce corrected patches would meaningfully accelerate software delivery.
|
| 10 |
+
|
| 11 |
+
**CodeReviewEnv** simulates exactly this. Three tasks of increasing difficulty present agents with realistic pull requests containing planted defects. The agent must reason over code, report issues with structured annotations, submit a corrected patch, and deliver a final verdict β all within a bounded step budget.
|
| 12 |
+
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
## Environment Architecture
|
| 16 |
+
|
| 17 |
+
```
|
| 18 |
+
code-review-env/
|
| 19 |
+
βββ env.py # Core OpenEnv environment (reset / step / state)
|
| 20 |
+
βββ server.py # FastAPI HTTP server exposing the OpenEnv interface
|
| 21 |
+
βββ models.py # Pydantic typed models: Action, Observation, Reward, State
|
| 22 |
+
βββ openenv.yaml # OpenEnv metadata
|
| 23 |
+
βββ tasks/
|
| 24 |
+
β βββ task1_easy.py # Bug hunt: simple Python utility
|
| 25 |
+
β βββ task2_medium.py # Security audit: Flask auth endpoint
|
| 26 |
+
β βββ task3_hard.py # Correctness: distributed LRU cache
|
| 27 |
+
βββ graders/
|
| 28 |
+
β βββ grader.py # Deterministic keyword + AST graders
|
| 29 |
+
βββ agents/
|
| 30 |
+
β βββ baseline_agent.py # HF Inference API baseline (OpenAI-compatible)
|
| 31 |
+
βββ Dockerfile
|
| 32 |
+
βββ requirements.txt
|
| 33 |
+
βββ README.md
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## Action Space
|
| 39 |
+
|
| 40 |
+
Each agent turn is a single `ReviewAction` JSON object:
|
| 41 |
+
|
| 42 |
+
| Field | Type | Description |
|
| 43 |
+
|---|---|---|
|
| 44 |
+
| `action_type` | `"review" \| "patch" \| "comment" \| "submit"` | What the agent is doing |
|
| 45 |
+
| `severity` | `"critical" \| "major" \| "minor" \| "info"` | Issue severity (for `review`) |
|
| 46 |
+
| `issue_type` | `"bug" \| "security" \| "performance" \| "logic" \| "style"` | Issue category |
|
| 47 |
+
| `line_number` | `int \| null` | Line the issue is on |
|
| 48 |
+
| `description` | `str` | Concise natural-language description of the issue |
|
| 49 |
+
| `patched_code` | `str \| null` | Full corrected code (for `patch` actions) |
|
| 50 |
+
| `comment` | `str \| null` | Free-form annotation |
|
| 51 |
+
| `verdict` | `"approve" \| "request_changes" \| "reject"` | Final verdict (for `submit`) |
|
| 52 |
+
| `confidence` | `float [0.0, 1.0]` | Agent's self-reported confidence |
|
| 53 |
+
|
| 54 |
+
---
|
| 55 |
+
|
| 56 |
+
## Observation Space
|
| 57 |
+
|
| 58 |
+
Each step returns an `Observation` containing:
|
| 59 |
+
|
| 60 |
+
| Field | Description |
|
| 61 |
+
|---|---|
|
| 62 |
+
| `task_id` | Identifier of the current task |
|
| 63 |
+
| `step` / `max_steps` | Current step and budget |
|
| 64 |
+
| `review_context` | Full PR: title, author, description, code files, linter output, test results |
|
| 65 |
+
| `previous_actions` | All actions taken so far this episode |
|
| 66 |
+
| `issues_found_so_far` | Structured list of issues reported |
|
| 67 |
+
| `score_so_far` | Running cumulative intermediate reward |
|
| 68 |
+
| `done` | Whether the episode has ended |
|
| 69 |
+
|
| 70 |
+
---
|
| 71 |
+
|
| 72 |
+
## Reward Function
|
| 73 |
+
|
| 74 |
+
Reward is **dense** β provided at every step, not only at the end.
|
| 75 |
+
|
| 76 |
+
### Intermediate (per-step)
|
| 77 |
+
|
| 78 |
+
| Signal | Value | Rationale |
|
| 79 |
+
|---|---|---|
|
| 80 |
+
| Step penalty | β0.01 | Encourages efficiency |
|
| 81 |
+
| Review with description | +0.05 | Rewards substantive annotations |
|
| 82 |
+
| Critical severity bonus | +0.03 | Rewards correct triage |
|
| 83 |
+
| Patch submitted | +0.10 | Rewards producing a fix |
|
| 84 |
+
| Repetition penalty | β0.05 | Penalises looping / copy-paste |
|
| 85 |
+
|
| 86 |
+
### Terminal (on `submit` or step exhaustion)
|
| 87 |
+
|
| 88 |
+
The programmatic grader runs and returns a score in **[0.0, 1.0]** based on which issues were correctly identified and how well the submitted patch addresses them. This final score overwrites the episode total.
|
| 89 |
+
|
| 90 |
+
---
|
| 91 |
+
|
| 92 |
+
## Tasks
|
| 93 |
+
|
| 94 |
+
### Task 1 β Easy: Bug Hunt (`task_1_easy_bug_hunt`)
|
| 95 |
+
|
| 96 |
+
**Max steps:** 8
|
| 97 |
+
**File reviewed:** `utils.py` (Python, 30 lines)
|
| 98 |
+
|
| 99 |
+
A developer submits three utility functions. Three bugs are planted:
|
| 100 |
+
|
| 101 |
+
| # | Line | Bug | Severity |
|
| 102 |
+
|---|---|---|---|
|
| 103 |
+
| 1 | 3 | `=` (assignment) used instead of `==` (comparison) β causes `SyntaxError` | Critical |
|
| 104 |
+
| 2 | 6 | `range(1, len(numbers) + 1)` β off-by-one causes `IndexError` | Critical |
|
| 105 |
+
| 3 | 9 | Missing `return max_val` β function silently returns `None` | Major |
|
| 106 |
+
|
| 107 |
+
**Grading:** 30% per critical bug identified, 20% for the major bug, 20% for a syntactically valid patch with all three fixes applied.
|
| 108 |
+
|
| 109 |
+
---
|
| 110 |
+
|
| 111 |
+
### Task 2 β Medium: Security Audit (`task_2_medium_security`)
|
| 112 |
+
|
| 113 |
+
**Max steps:** 12
|
| 114 |
+
**File reviewed:** `auth.py` (Flask, 55 lines)
|
| 115 |
+
|
| 116 |
+
A backend developer submits login and registration endpoints. Six security vulnerabilities are present:
|
| 117 |
+
|
| 118 |
+
| # | Line | Vulnerability | Severity |
|
| 119 |
+
|---|---|---|---|
|
| 120 |
+
| 1 | 23 | SQL injection in `login` query (f-string interpolation) | Critical |
|
| 121 |
+
| 2 | 44 | SQL injection in `register` INSERT | Critical |
|
| 122 |
+
| 3 | 39 | Plaintext password storage (no hashing) | Critical |
|
| 123 |
+
| 4 | β | No rate limiting on `/login` (brute-force possible) | Major |
|
| 124 |
+
| 5 | 30 | Sensitive data leakage: error distinguishes "wrong password" vs "user not found" | Major |
|
| 125 |
+
| 6 | 5 | Hardcoded `secret_key` in source | Major |
|
| 126 |
+
|
| 127 |
+
**Grading:** Weighted by severity. Patch checked for parameterized queries, password hashing, and environment variable use.
|
| 128 |
+
|
| 129 |
+
---
|
| 130 |
+
|
| 131 |
+
### Task 3 β Hard: Distributed Systems Correctness (`task_3_hard_perf_correctness`)
|
| 132 |
+
|
| 133 |
+
**Max steps:** 16
|
| 134 |
+
**File reviewed:** `cache.py` (Python, 55 lines)
|
| 135 |
+
|
| 136 |
+
A senior engineer submits a Redis-backed LRU cache claimed to be production-ready. Six issues lurk:
|
| 137 |
+
|
| 138 |
+
| # | Issue | Type | Severity |
|
| 139 |
+
|---|---|---|---|
|
| 140 |
+
| 1 | Non-atomic `EXISTS` + `GET` creates a race condition | Concurrency | Critical |
|
| 141 |
+
| 2 | Local `dict` grows unboundedly β `capacity` parameter ignored | Performance | Critical |
|
| 142 |
+
| 3 | `get_many` calls `self.get()` in a loop (N+1 round trips) | Performance | Major |
|
| 143 |
+
| 4 | `dict` preserves insertion order, not access order β LRU eviction is wrong | Logic | Major |
|
| 144 |
+
| 5 | Shared `dict` modified without a `threading.Lock` | Concurrency | Critical |
|
| 145 |
+
| 6 | `pickle.loads` on bytes from Redis β arbitrary code execution | Security | Critical |
|
| 146 |
+
|
| 147 |
+
**Grading:** Equally weighted. Patch checked structurally for `threading.Lock`, `OrderedDict.move_to_end`, `mget`, and `json` instead of `pickle`.
|
| 148 |
+
|
| 149 |
+
---
|
| 150 |
+
|
| 151 |
+
## Baseline Performance
|
| 152 |
+
|
| 153 |
+
Evaluated with `Qwen/Qwen2.5-72B-Instruct` via Hugging Face Inference API:
|
| 154 |
+
|
| 155 |
+
| Task | Score |
|
| 156 |
+
|---|---|
|
| 157 |
+
| Task 1 β Easy | 0.72 |
|
| 158 |
+
| Task 2 β Medium | 0.55 |
|
| 159 |
+
| Task 3 β Hard | 0.38 |
|
| 160 |
+
| **Aggregate** | **0.55** |
|
| 161 |
+
|
| 162 |
+
---
|
| 163 |
+
|
| 164 |
+
## Setup & Usage
|
| 165 |
+
|
| 166 |
+
### 1. Local (Python)
|
| 167 |
+
|
| 168 |
+
```bash
|
| 169 |
+
git clone <repo>
|
| 170 |
+
cd code-review-env
|
| 171 |
+
pip install -r requirements.txt
|
| 172 |
+
python server.py
|
| 173 |
+
# Server running at http://localhost:7860
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
### 2. Docker
|
| 177 |
+
|
| 178 |
+
```bash
|
| 179 |
+
docker build -t code-review-env .
|
| 180 |
+
docker run -p 7860:7860 code-review-env
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
### 3. API Quickstart
|
| 184 |
+
|
| 185 |
+
```bash
|
| 186 |
+
# Reset to task 1
|
| 187 |
+
curl -X POST http://localhost:7860/reset \
|
| 188 |
+
-H "Content-Type: application/json" \
|
| 189 |
+
-d '{"task_id": "task_1_easy_bug_hunt"}'
|
| 190 |
+
|
| 191 |
+
# Take a step
|
| 192 |
+
curl -X POST http://localhost:7860/step \
|
| 193 |
+
-H "Content-Type: application/json" \
|
| 194 |
+
-d '{
|
| 195 |
+
"session_id": "<session_id>",
|
| 196 |
+
"action": {
|
| 197 |
+
"action_type": "review",
|
| 198 |
+
"severity": "critical",
|
| 199 |
+
"issue_type": "bug",
|
| 200 |
+
"line_number": 3,
|
| 201 |
+
"description": "Assignment operator = used instead of comparison == on line 3"
|
| 202 |
+
}
|
| 203 |
+
}'
|
| 204 |
+
```
|
| 205 |
+
|
| 206 |
+
### 4. Run inference script
|
| 207 |
+
|
| 208 |
+
```bash
|
| 209 |
+
export HF_TOKEN=hf_your_token_here
|
| 210 |
+
export API_BASE_URL=https://router.huggingface.co/v1
|
| 211 |
+
export MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
|
| 212 |
+
python inference.py
|
| 213 |
+
```
|
| 214 |
+
|
| 215 |
+
Expected stdout format:
|
| 216 |
+
```
|
| 217 |
+
[START] task=task_1_easy_bug_hunt env=code-review-env model=Qwen/Qwen2.5-72B-Instruct
|
| 218 |
+
[STEP] step=1 action=review:assignment operator = instead of == reward=0.07 done=false error=null
|
| 219 |
+
[STEP] step=2 action=review:off-by-one in range reward=0.07 done=false error=null
|
| 220 |
+
[STEP] step=3 action=patch:fixed code reward=0.10 done=false error=null
|
| 221 |
+
[STEP] step=4 action=submit:request_changes reward=1.00 done=true error=null
|
| 222 |
+
[END] success=true steps=4 score=1.000 rewards=0.07,0.07,0.10,1.00
|
| 223 |
+
```
|
| 224 |
+
|
| 225 |
+
### 5. OpenEnv validation
|
| 226 |
+
|
| 227 |
+
```bash
|
| 228 |
+
openenv validate .
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
---
|
| 232 |
+
|
| 233 |
+
## HTTP API Reference
|
| 234 |
+
|
| 235 |
+
| Method | Endpoint | Description |
|
| 236 |
+
|---|---|---|
|
| 237 |
+
| `GET` | `/` | Environment info |
|
| 238 |
+
| `GET` | `/tasks` | List all tasks |
|
| 239 |
+
| `POST` | `/reset` | Start a new episode |
|
| 240 |
+
| `POST` | `/step` | Take an action |
|
| 241 |
+
| `GET` | `/state/{session_id}` | Inspect full environment state |
|
| 242 |
+
| `DELETE` | `/session/{session_id}` | Clean up session |
|
| 243 |
+
|
| 244 |
+
---
|
| 245 |
+
|
| 246 |
+
## Hugging Face Spaces Deployment
|
| 247 |
+
|
| 248 |
+
The `Dockerfile` targets port `7860` and runs as a non-root user β compatible with HF Spaces Docker SDK out of the box. Tag the Space with `openenv`.
|
| 249 |
+
|
| 250 |
+
```yaml
|
| 251 |
+
# README header for HF Spaces
|
| 252 |
+
---
|
| 253 |
+
title: CodeReviewEnv
|
| 254 |
+
emoji: π
|
| 255 |
+
colorFrom: indigo
|
| 256 |
+
colorTo: blue
|
| 257 |
+
sdk: docker
|
| 258 |
+
pinned: false
|
| 259 |
+
tags:
|
| 260 |
+
- openenv
|
| 261 |
+
---
|
| 262 |
+
```
|
agents/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# agents package
|
agents/baseline_agent.py
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Baseline inference script for CodeReviewEnv.
|
| 3 |
+
|
| 4 |
+
Evaluates a model (via OpenAI-compatible API) across all three tasks and
|
| 5 |
+
reports per-task and aggregate scores.
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
HF_TOKEN=<your_token> python agents/baseline_agent.py [--model MODEL] [--server URL]
|
| 9 |
+
|
| 10 |
+
The script uses the Hugging Face Inference API (OpenAI-compatible endpoint)
|
| 11 |
+
with the model specified via --model (default: Qwen/Qwen2.5-72B-Instruct).
|
| 12 |
+
"""
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import argparse
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
import sys
|
| 19 |
+
import time
|
| 20 |
+
from typing import Any, Dict, List
|
| 21 |
+
|
| 22 |
+
import requests
|
| 23 |
+
from openai import OpenAI
|
| 24 |
+
|
| 25 |
+
# ---- Config -------------------------------------------------------------------

DEFAULT_MODEL = "Qwen/Qwen2.5-72B-Instruct"
DEFAULT_SERVER = "http://localhost:7860"
# OpenAI-compatible inference endpoint. Overridable via API_BASE_URL so usage
# matches the README quickstart (`export API_BASE_URL=...`). Defaults to the
# HF router endpoint, which supersedes the legacy api-inference.huggingface.co
# endpoint previously hard-coded here.
HF_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")

# Task identifiers, in increasing difficulty order.
TASK_IDS = [
    "task_1_easy_bug_hunt",
    "task_2_medium_security",
    "task_3_hard_perf_correctness",
]
|
| 36 |
+
|
| 37 |
+
# ββ Prompts βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 38 |
+
|
| 39 |
+
SYSTEM_PROMPT = """\
|
| 40 |
+
You are an expert software engineer performing a thorough code review.
|
| 41 |
+
Your task is to:
|
| 42 |
+
1. Carefully read the provided code.
|
| 43 |
+
2. Identify ALL bugs, security vulnerabilities, performance issues, and correctness problems.
|
| 44 |
+
3. For each issue, output a JSON action with action_type="review".
|
| 45 |
+
4. After all issues are identified, output a patch with action_type="patch".
|
| 46 |
+
5. Finally, output action_type="submit" with your verdict.
|
| 47 |
+
|
| 48 |
+
Each action must be valid JSON matching this schema:
|
| 49 |
+
{
|
| 50 |
+
"action_type": "review" | "patch" | "comment" | "submit",
|
| 51 |
+
"severity": "critical" | "major" | "minor" | "info", // for review
|
| 52 |
+
"issue_type": "bug" | "security" | "performance" | "logic" | "style",
|
| 53 |
+
"line_number": <int or null>,
|
| 54 |
+
"description": "<concise description of the issue>",
|
| 55 |
+
"patched_code": "<full corrected code>", // for patch
|
| 56 |
+
"comment": "<optional comment>",
|
| 57 |
+
"verdict": "approve" | "request_changes" | "reject", // for submit
|
| 58 |
+
"confidence": <0.0-1.0>
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
Output ONE action JSON per message. Be precise and thorough.
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def build_user_prompt(obs: Dict[str, Any]) -> str:
    """Render an environment observation into the user prompt for the model.

    Includes the PR metadata, every changed file, linter/test output,
    progress counters, the issues already reported, and a closing
    instruction that depends on how many steps remain.

    Args:
        obs: Observation dict from the environment, with at least
            ``review_context``, ``step`` and ``max_steps`` keys.

    Returns:
        The fully formatted prompt string.
    """
    ctx = obs["review_context"]
    # One section per changed file: header line + raw file content.
    files_text = "\n\n".join(
        f"=== {f['filename']} ({f['language']}) ===\n{f['content']}"
        for f in ctx["files_changed"]
    )
    # (removed unused local `prev = obs.get("previous_actions", [])`)
    issues_so_far = obs.get("issues_found_so_far", [])

    prompt = f"""Pull Request: {ctx['pull_request_title']}
Author: {ctx['author']}
Description: {ctx['description']}

Linter: {ctx.get('linter_output', 'N/A')}
Tests: {ctx.get('test_results', 'N/A')}

--- CODE ---
{files_text}
--- END CODE ---

Steps taken so far: {obs['step']} / {obs['max_steps']}
Issues identified so far: {len(issues_so_far)}
"""
    if issues_so_far:
        prompt += "\nIssues already reported:\n"
        for iss in issues_so_far:
            prompt += f" - [{iss.get('severity','?')}] line {iss.get('line','?')}: {iss.get('description','')}\n"

    # Closing instruction varies with the remaining step budget.
    if obs["step"] == 0:
        prompt += "\nPlease begin your review. Output your first action as JSON."
    elif obs["step"] >= obs["max_steps"] - 2:
        prompt += "\nYou are running low on steps. Please submit a patch and final verdict now."
    else:
        prompt += "\nContinue your review or submit if done. Output next action as JSON."

    return prompt
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# ββ Agent loop ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 104 |
+
|
| 105 |
+
def extract_json(text: str) -> Dict[str, Any]:
    """Extract the first JSON object from a model response.

    Tries a direct parse first; otherwise scans for the first position at
    which a JSON object can be decoded. Uses ``json.JSONDecoder.raw_decode``
    instead of manual brace counting, which miscounted braces appearing
    inside JSON string values (e.g. a description containing ``"}"``).

    Args:
        text: Raw model output, possibly with prose around the JSON.

    Returns:
        The first decoded JSON object.

    Raises:
        ValueError: if no JSON object can be found in *text*.
    """
    text = text.strip()
    # Fast path: the whole response is the JSON object.
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    # Scan: try to decode starting at each "{" until one parses as a dict.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
            continue
        if isinstance(obj, dict):
            return obj
        start = text.find("{", start + 1)
    raise ValueError("No JSON found in response")
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def run_episode(
    client: OpenAI,
    model: str,
    server: str,
    task_id: str,
) -> Dict[str, Any]:
    """Run a single episode and return the result dict.

    Drives the reset → (model call → parse → step)* loop against the
    environment HTTP server until the episode reports done, then deletes
    the session. Returns ``{"task_id", "final_score", "steps_taken"}``.
    Network errors from the /step call propagate via raise_for_status.
    """

    # 1. Reset: start a fresh session for this task on the env server.
    resp = requests.post(f"{server}/reset", json={"task_id": task_id}, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    session_id = data["session_id"]
    obs = data["observation"]

    print(f"\n{'='*60}")
    print(f"Task: {task_id}")
    print(f"Session: {session_id}")
    print(f"{'='*60}")

    # Full chat transcript (user/assistant turns) re-sent on every model call.
    history: List[Dict[str, str]] = []
    final_score = 0.0
    done = False
    patch_submitted = False

    while not done:
        user_msg = build_user_prompt(obs)
        history.append({"role": "user", "content": user_msg})

        # Call model. Any API failure aborts the episode (break, not raise)
        # so the session cleanup below still runs.
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": SYSTEM_PROMPT}] + history,
                max_tokens=1024,
                temperature=0.2,
            )
            raw = completion.choices[0].message.content or ""
        except Exception as exc:
            print(f" [Model error] {exc}")
            break

        history.append({"role": "assistant", "content": raw})

        # Parse action from the raw model text.
        try:
            action_dict = extract_json(raw)
        except ValueError as exc:
            print(f" [Parse error] {exc} | raw={raw[:200]!r}")
            # Force a submit to avoid infinite spin
            action_dict = {"action_type": "submit", "verdict": "request_changes", "confidence": 0.3}

        action_type = action_dict.get("action_type", "review")
        print(f" Step {obs['step']+1}: {action_type} | {action_dict.get('description','')[:80]}")

        # Auto-submit near step limit.
        # NOTE(review): when the action is swapped to a patch here,
        # `action_type` still holds the model's original value, so the
        # `patch_submitted` check below does not fire for this forced patch;
        # presumably harmless because the episode ends at the step limit —
        # confirm against the env's done condition.
        if obs["step"] >= obs["max_steps"] - 1 and action_type != "submit":
            action_dict = {"action_type": "submit", "verdict": "request_changes", "confidence": 0.5}
            if not patch_submitted:
                # Submit a patch first (falls back to the original file content).
                action_dict = {
                    "action_type": "patch",
                    "patched_code": obs["review_context"]["files_changed"][0]["content"],
                }

        if action_type == "patch":
            patch_submitted = True

        # Step the environment with the chosen action.
        step_resp = requests.post(
            f"{server}/step",
            json={"session_id": session_id, "action": action_dict},
            timeout=30,
        )
        step_resp.raise_for_status()
        step_data = step_resp.json()
        obs = step_data["observation"]
        done = step_data["done"]
        info = step_data.get("info", {})

        if done:
            # Terminal grading info comes back in the final step's `info`.
            final_score = info.get("final_score", 0.0)
            breakdown = info.get("breakdown", {})
            print(f"\n Final score: {final_score:.4f}")
            print(f" Breakdown: {json.dumps(breakdown, indent=4)}")

        time.sleep(0.3)  # be polite to the API

    # Cleanup: best-effort session deletion on the env server.
    requests.delete(f"{server}/session/{session_id}", timeout=10)

    return {
        "task_id": task_id,
        "final_score": final_score,
        "steps_taken": obs["step"],
    }
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
# ββ Main ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 227 |
+
|
| 228 |
+
def main():
    """CLI entry point: run the baseline agent on one or all tasks.

    Requires HF_TOKEN in the environment; exits with status 1 otherwise.
    Prints a per-task summary and writes baseline_results.json to the
    current working directory.
    """
    parser = argparse.ArgumentParser(description="CodeReviewEnv baseline agent")
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--server", default=DEFAULT_SERVER)
    parser.add_argument("--task", default=None, help="Run a single task (default: all)")
    args = parser.parse_args()

    # The HF token doubles as the OpenAI-compatible API key.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("ERROR: HF_TOKEN environment variable not set.", file=sys.stderr)
        sys.exit(1)

    client = OpenAI(
        api_key=hf_token,
        base_url=HF_BASE_URL,
    )

    # Single task if --task given, otherwise the full benchmark suite.
    tasks = [args.task] if args.task else TASK_IDS
    results = []

    for task_id in tasks:
        result = run_episode(client, args.model, args.server, task_id)
        results.append(result)

    # Summary
    print("\n" + "=" * 60)
    print("BASELINE SUMMARY")
    print("=" * 60)
    for r in results:
        print(f" {r['task_id']:<40} score={r['final_score']:.4f} steps={r['steps_taken']}")

    # Aggregate only when the whole suite ran, so partial runs aren't
    # reported as a (misleading) benchmark average.
    if len(results) == len(TASK_IDS):
        avg = sum(r["final_score"] for r in results) / len(results)
        print(f"\n Aggregate average score: {avg:.4f}")

    # Save results
    out_path = "baseline_results.json"
    with open(out_path, "w") as f:
        json.dump({"model": args.model, "results": results}, f, indent=2)
    print(f"\n Results saved to {out_path}")
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
if __name__ == "__main__":
|
| 271 |
+
main()
|
env.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CodeReviewEnv β OpenEnv-compliant environment.
|
| 3 |
+
|
| 4 |
+
Implements:
|
| 5 |
+
reset() β Observation
|
| 6 |
+
step(action) β (Observation, StepReward, done, info)
|
| 7 |
+
state() β EnvironmentState
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import copy
|
| 12 |
+
import sys
|
| 13 |
+
import os
|
| 14 |
+
|
| 15 |
+
sys.path.insert(0, os.path.dirname(__file__))
|
| 16 |
+
|
| 17 |
+
from typing import Any, Dict, Tuple
|
| 18 |
+
|
| 19 |
+
from models import (
|
| 20 |
+
CodeFile,
|
| 21 |
+
EnvironmentState,
|
| 22 |
+
Observation,
|
| 23 |
+
ReviewAction,
|
| 24 |
+
ReviewContext,
|
| 25 |
+
StepReward,
|
| 26 |
+
)
|
| 27 |
+
from graders.grader import grade
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# ββ Task registry ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 31 |
+
def _load_task(task_id: str) -> Dict[str, Any]:
|
| 32 |
+
if task_id == "task_1_easy_bug_hunt":
|
| 33 |
+
from tasks.task1_easy import get_task_config
|
| 34 |
+
elif task_id == "task_2_medium_security":
|
| 35 |
+
from tasks.task2_medium import get_task_config
|
| 36 |
+
elif task_id == "task_3_hard_perf_correctness":
|
| 37 |
+
from tasks.task3_hard import get_task_config
|
| 38 |
+
else:
|
| 39 |
+
raise ValueError(f"Unknown task_id: {task_id!r}")
|
| 40 |
+
return get_task_config()
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# Canonical task identifiers accepted by reset(), in increasing difficulty order.
TASK_IDS = [
    "task_1_easy_bug_hunt",
    "task_2_medium_security",
    "task_3_hard_perf_correctness",
]
|
| 48 |
+
|
| 49 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class CodeReviewEnv:
|
| 53 |
+
"""OpenEnv-compliant code review environment."""
|
| 54 |
+
|
| 55 |
+
# ββ Lifecycle ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 56 |
+
|
| 57 |
+
    def reset(self, task_id: str = "task_1_easy_bug_hunt") -> Observation:
        """Reset the environment for a given task. Returns the initial observation.

        Loads the task config, builds the typed review context from the
        task's pull-request fixture, and initializes a fresh episode state
        at step 0. Raises ValueError (from _load_task) for an unknown id.
        """
        cfg = _load_task(task_id)
        pr = cfg["pull_request"]

        # Materialize the PR fixture into the typed models.
        files = [CodeFile(**f) for f in pr["files_changed"]]
        review_ctx = ReviewContext(
            pull_request_title=pr["pull_request_title"],
            author=pr["author"],
            description=pr["description"],
            files_changed=files,
            # Optional fields: absent in the fixture means None.
            test_results=pr.get("test_results"),
            linter_output=pr.get("linter_output"),
        )

        # Fresh episode state; step() mutates this in place.
        self._state = EnvironmentState(
            task_id=task_id,
            step=0,
            max_steps=cfg["max_steps"],
            review_context=review_ctx,
        )
        self._cfg = cfg
        return self._make_observation()
|
| 80 |
+
|
| 81 |
+
def step(self, action: ReviewAction) -> Tuple[Observation, StepReward, bool, Dict[str, Any]]:
|
| 82 |
+
"""
|
| 83 |
+
Apply an action. Returns (observation, reward, done, info).
|
| 84 |
+
Raises RuntimeError if called before reset().
|
| 85 |
+
"""
|
| 86 |
+
if not hasattr(self, "_state"):
|
| 87 |
+
raise RuntimeError("Call reset() before step().")
|
| 88 |
+
|
| 89 |
+
s = self._state
|
| 90 |
+
|
| 91 |
+
# ββ Terminal check βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 92 |
+
if s.done:
|
| 93 |
+
obs = self._make_observation(feedback="Episode already finished.")
|
| 94 |
+
return obs, StepReward(value=0.0, explanation="Episode done."), True, {}
|
| 95 |
+
|
| 96 |
+
s.step += 1
|
| 97 |
+
|
| 98 |
+
# ββ Absorb action ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 99 |
+
s.actions_taken.append(action)
|
| 100 |
+
|
| 101 |
+
# Record issue if it is a review action
|
| 102 |
+
if action.action_type == "review" and action.description:
|
| 103 |
+
issue = {
|
| 104 |
+
"step": s.step,
|
| 105 |
+
"severity": action.severity,
|
| 106 |
+
"issue_type": action.issue_type,
|
| 107 |
+
"line": action.line_number,
|
| 108 |
+
"description": action.description,
|
| 109 |
+
}
|
| 110 |
+
s.issues_identified.append(issue)
|
| 111 |
+
|
| 112 |
+
# Record patch
|
| 113 |
+
if action.action_type == "patch" and action.patched_code:
|
| 114 |
+
s.patch_submitted = action.patched_code
|
| 115 |
+
|
| 116 |
+
# Record verdict
|
| 117 |
+
if action.action_type == "submit" and action.verdict:
|
| 118 |
+
s.verdict_submitted = action.verdict
|
| 119 |
+
|
| 120 |
+
# ββ Reward βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 121 |
+
reward = self._compute_step_reward(action)
|
| 122 |
+
s.total_reward += reward.value
|
| 123 |
+
|
| 124 |
+
# ββ Done condition βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 125 |
+
submitted = action.action_type == "submit"
|
| 126 |
+
out_of_steps = s.step >= s.max_steps
|
| 127 |
+
|
| 128 |
+
if submitted or out_of_steps:
|
| 129 |
+
final_score, breakdown = grade(s)
|
| 130 |
+
s.total_reward = final_score
|
| 131 |
+
s.done = True
|
| 132 |
+
s.terminated_reason = "submitted" if submitted else "max_steps_reached"
|
| 133 |
+
reward = StepReward(
|
| 134 |
+
value=final_score,
|
| 135 |
+
breakdown=breakdown,
|
| 136 |
+
explanation=f"Final score: {final_score:.3f}",
|
| 137 |
+
)
|
| 138 |
+
info = {"final_score": final_score, "breakdown": breakdown, "reason": s.terminated_reason}
|
| 139 |
+
else:
|
| 140 |
+
info = {"step": s.step, "cumulative_reward": s.total_reward}
|
| 141 |
+
|
| 142 |
+
obs = self._make_observation()
|
| 143 |
+
return obs, reward, s.done, info
|
| 144 |
+
|
| 145 |
+
def state(self) -> EnvironmentState:
|
| 146 |
+
if not hasattr(self, "_state"):
|
| 147 |
+
raise RuntimeError("Call reset() before state().")
|
| 148 |
+
return copy.deepcopy(self._state)
|
| 149 |
+
|
| 150 |
+
# ββ Internal helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 151 |
+
|
| 152 |
+
def _make_observation(self, feedback: str | None = None) -> Observation:
|
| 153 |
+
s = self._state
|
| 154 |
+
return Observation(
|
| 155 |
+
task_id=s.task_id,
|
| 156 |
+
step=s.step,
|
| 157 |
+
max_steps=s.max_steps,
|
| 158 |
+
review_context=s.review_context,
|
| 159 |
+
previous_actions=list(s.actions_taken),
|
| 160 |
+
feedback=feedback,
|
| 161 |
+
issues_found_so_far=list(s.issues_identified),
|
| 162 |
+
score_so_far=s.total_reward,
|
| 163 |
+
done=s.done,
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
def _compute_step_reward(self, action: ReviewAction) -> StepReward:
|
| 167 |
+
"""
|
| 168 |
+
Dense intermediate reward:
|
| 169 |
+
+0.05 for a review action with a non-empty description
|
| 170 |
+
+0.03 for a review action with severity='critical'
|
| 171 |
+
+0.10 for a patch action with non-empty code
|
| 172 |
+
-0.05 for repeated identical descriptions (loop detection)
|
| 173 |
+
-0.10 step penalty (encourages efficiency)
|
| 174 |
+
"""
|
| 175 |
+
s = self._state
|
| 176 |
+
r = 0.0
|
| 177 |
+
parts: Dict[str, float] = {}
|
| 178 |
+
|
| 179 |
+
STEP_PENALTY = -0.01
|
| 180 |
+
r += STEP_PENALTY
|
| 181 |
+
parts["step_penalty"] = STEP_PENALTY
|
| 182 |
+
|
| 183 |
+
if action.action_type == "review":
|
| 184 |
+
if action.description:
|
| 185 |
+
parts["review_description"] = 0.05
|
| 186 |
+
r += 0.05
|
| 187 |
+
if action.severity == "critical":
|
| 188 |
+
parts["critical_severity_bonus"] = 0.03
|
| 189 |
+
r += 0.03
|
| 190 |
+
# Loop detection: penalise if same description appeared before
|
| 191 |
+
prev_descs = [
|
| 192 |
+
a.description for a in s.actions_taken[:-1]
|
| 193 |
+
if a.description
|
| 194 |
+
]
|
| 195 |
+
if action.description and action.description in prev_descs:
|
| 196 |
+
parts["repetition_penalty"] = -0.05
|
| 197 |
+
r += -0.05
|
| 198 |
+
|
| 199 |
+
elif action.action_type == "patch":
|
| 200 |
+
if action.patched_code and len(action.patched_code) > 50:
|
| 201 |
+
parts["patch_submitted"] = 0.10
|
| 202 |
+
r += 0.10
|
| 203 |
+
|
| 204 |
+
elif action.action_type == "submit":
|
| 205 |
+
pass # final score handled in step()
|
| 206 |
+
|
| 207 |
+
return StepReward(
|
| 208 |
+
value=max(-1.0, min(1.0, r)),
|
| 209 |
+
breakdown=parts,
|
| 210 |
+
explanation=f"Step {s.step} intermediate reward",
|
| 211 |
+
)
|
graders/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# graders package
|
graders/grader.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Programmatic graders for all three tasks.
|
| 3 |
+
Each grader returns a score in [0.0, 1.0] with a breakdown dict.
|
| 4 |
+
Grading is deterministic: keyword matching + structural checks.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
import ast
|
| 8 |
+
import re
|
| 9 |
+
from typing import Any, Dict, List, Tuple
|
| 10 |
+
|
| 11 |
+
from models import ReviewAction, EnvironmentState
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# βββ Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 15 |
+
|
| 16 |
+
def _keywords_hit(text: str, keywords: List[str]) -> bool:
|
| 17 |
+
"""Return True if any keyword appears in text (case-insensitive)."""
|
| 18 |
+
t = text.lower()
|
| 19 |
+
return any(kw.lower() in t for kw in keywords)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _actions_mention_bug(actions: List[ReviewAction], bug: Dict[str, Any]) -> bool:
    """Check whether any action mentions the given bug via keyword matching.

    The description, comment, and issue_type of each action are joined
    into one search string and matched against bug["description_keywords"].
    """
    keywords = bug["description_keywords"]
    for action in actions:
        fragments = (action.description, action.comment, action.issue_type)
        combined = " ".join(part for part in fragments if part)
        if _keywords_hit(combined, keywords):
            return True
    return False
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _patch_fixes_syntax(patched_code: str) -> bool:
|
| 37 |
+
"""Try to parse the patched code as valid Python."""
|
| 38 |
+
try:
|
| 39 |
+
ast.parse(patched_code)
|
| 40 |
+
return True
|
| 41 |
+
except SyntaxError:
|
| 42 |
+
return False
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _patch_contains_fix(patched_code: str, fix_keywords: List[str]) -> bool:
    """Return True if the patch text mentions any expected fix keyword (case-insensitive)."""
    return _keywords_hit(patched_code, fix_keywords)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# βββ Task 1 Grader ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
+
|
| 51 |
+
def grade_task1(state: EnvironmentState) -> Tuple[float, Dict[str, float]]:
    """
    Score breakdown:
      - 30% : identified comparison operator bug (= vs ==)
      - 30% : identified off-by-one bug
      - 20% : identified missing return
      - 20% : patch parses correctly and contains all three fixes
    """
    from tasks.task1_easy import KNOWN_BUGS

    actions = state.actions_taken
    breakdown: Dict[str, float] = {}

    # Bug identification (80% total: 0.30 per critical bug, 0.20 otherwise)
    for bug_name, bug_info in KNOWN_BUGS.items():
        hit = _actions_mention_bug(actions, bug_info)
        weight = 0.30 if bug_info["severity"] == "critical" else 0.20
        breakdown[f"found_{bug_name}"] = weight if hit else 0.0

    # Patch quality (20%): structural keyword checks, not execution
    patch_score = 0.0
    if state.patch_submitted:
        p = state.patch_submitted
        if _patch_fixes_syntax(p):
            patch_score += 0.10
        # Heuristic: "==" present and no stray "= 0" assignment left behind
        # once all "==" occurrences are removed from the text.
        if "==" in p and "= 0" not in p.replace("==", ""):
            patch_score += 0.04
        if "range(1, len(numbers))" in p:
            patch_score += 0.03
        if re.search(r"return\s+max_val", p):
            patch_score += 0.03
    breakdown["patch_quality"] = patch_score

    total = sum(breakdown.values())
    return min(total, 1.0), breakdown
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# βββ Task 2 Grader ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 89 |
+
|
| 90 |
+
def grade_task2(state: EnvironmentState) -> Tuple[float, Dict[str, float]]:
    """
    Score breakdown:
      - 20% : identified SQL injection (login)
      - 20% : identified SQL injection (register)
      - 15% : identified plaintext password
      - 10% : identified no rate limiting
      - 10% : identified sensitive data leakage
      - 05% : identified hardcoded secret
      - 20% : patch uses parameterized queries + password hashing
    """
    from tasks.task2_medium import KNOWN_VULNERABILITIES

    actions = state.actions_taken
    breakdown: Dict[str, float] = {}

    # Per-vulnerability credit; keys must match tasks.task2_medium.KNOWN_VULNERABILITIES
    weights = {
        "sql_injection_login": 0.20,
        "sql_injection_register": 0.20,
        "plaintext_password": 0.15,
        "no_rate_limiting": 0.10,
        "sensitive_data_leak": 0.10,
        "hardcoded_secret": 0.05,
    }

    for vuln_name, vuln_info in KNOWN_VULNERABILITIES.items():
        hit = _actions_mention_bug(actions, vuln_info)
        breakdown[f"found_{vuln_name}"] = weights[vuln_name] if hit else 0.0

    # Patch quality (20%): structural keyword checks, not execution
    patch_score = 0.0
    if state.patch_submitted:
        p = state.patch_submitted
        if _patch_fixes_syntax(p):
            patch_score += 0.05
        if "?" in p and "execute" in p:  # parameterized
            patch_score += 0.07
        if _patch_contains_fix(p, ["generate_password_hash", "bcrypt", "argon2", "pbkdf2"]):
            patch_score += 0.05
        if _patch_contains_fix(p, ["os.environ", "environ.get", "getenv"]):
            patch_score += 0.03
    breakdown["patch_quality"] = patch_score

    total = sum(breakdown.values())
    return min(total, 1.0), breakdown
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
# βββ Task 3 Grader ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 138 |
+
|
| 139 |
+
def grade_task3(state: EnvironmentState) -> Tuple[float, Dict[str, float]]:
    """
    Score breakdown:
      - 15% : race condition
      - 15% : memory leak / missing eviction
      - 15% : N+1 query / mget
      - 10% : LRU order correctness
      - 15% : thread safety
      - 15% : pickle deserialization vulnerability
      - 15% : patch quality (structural checks)
    """
    from tasks.task3_hard import KNOWN_ISSUES

    actions = state.actions_taken
    breakdown: Dict[str, float] = {}

    # Per-issue credit; keys must match tasks.task3_hard.KNOWN_ISSUES
    weights = {
        "race_condition": 0.15,
        "memory_leak": 0.15,
        "n_plus_one": 0.15,
        "wrong_lru_order": 0.10,
        "thread_safety": 0.15,
        "pickle_injection": 0.15,
    }

    for issue_name, issue_info in KNOWN_ISSUES.items():
        hit = _actions_mention_bug(actions, issue_info)
        breakdown[f"found_{issue_name}"] = weights[issue_name] if hit else 0.0

    # Patch quality (15%): five structural checks worth 0.03 each
    patch_score = 0.0
    if state.patch_submitted:
        p = state.patch_submitted
        if _patch_fixes_syntax(p):
            patch_score += 0.03
        if _patch_contains_fix(p, ["threading.Lock", "Lock()", "_lock"]):
            patch_score += 0.03
        if _patch_contains_fix(p, ["OrderedDict", "move_to_end"]):
            patch_score += 0.03
        if _patch_contains_fix(p, ["mget", "pipeline"]):
            patch_score += 0.03
        if _patch_contains_fix(p, ["json.loads", "json.dumps"]) and "pickle" not in p:
            patch_score += 0.03
    breakdown["patch_quality"] = patch_score

    total = sum(breakdown.values())
    return min(total, 1.0), breakdown
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# βββ Dispatcher ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 189 |
+
|
| 190 |
+
GRADERS = {
|
| 191 |
+
"task_1_easy_bug_hunt": grade_task1,
|
| 192 |
+
"task_2_medium_security": grade_task2,
|
| 193 |
+
"task_3_hard_perf_correctness": grade_task3,
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def grade(state: EnvironmentState) -> Tuple[float, Dict[str, float]]:
    """Dispatch grading of *state* to the grader registered for its task_id.

    Raises:
        ValueError: if no grader is registered for state.task_id.
    """
    grader = GRADERS.get(state.task_id)
    if grader is not None:
        return grader(state)
    raise ValueError(f"No grader found for task_id={state.task_id!r}")
|
inference.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
inference.py β CodeReviewEnv baseline inference script.
|
| 3 |
+
|
| 4 |
+
Mandatory env vars:
|
| 5 |
+
API_BASE_URL The API endpoint for the LLM.
|
| 6 |
+
MODEL_NAME The model identifier to use for inference.
|
| 7 |
+
HF_TOKEN Your Hugging Face / API key.
|
| 8 |
+
|
| 9 |
+
STDOUT format (strictly followed):
|
| 10 |
+
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 11 |
+
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 12 |
+
[END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import json
|
| 16 |
+
import os
|
| 17 |
+
import sys
|
| 18 |
+
import textwrap
|
| 19 |
+
from typing import Any, Dict, List, Optional
|
| 20 |
+
|
| 21 |
+
from openai import OpenAI
|
| 22 |
+
|
| 23 |
+
sys.path.insert(0, os.path.dirname(__file__))
|
| 24 |
+
from env import CodeReviewEnv, TASK_IDS
|
| 25 |
+
from models import ReviewAction
|
| 26 |
+
|
| 27 |
+
# ββ Env vars ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
+
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 29 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 30 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
|
| 31 |
+
BENCHMARK = "code-review-env"
|
| 32 |
+
SUCCESS_SCORE_THRESHOLD = 0.5
|
| 33 |
+
|
| 34 |
+
# ββ Logging helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 35 |
+
|
| 36 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Emit the [START] banner line in the required stdout format."""
    banner = f"[START] task={task} env={env} model={model}"
    print(banner, flush=True)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Emit a [STEP] progress line in the required stdout format.

    The action string is flattened to a single line and truncated to
    120 characters so the log stays machine-parseable.
    """
    sanitized = action.replace("\n", " ").replace("\r", "")[:120]
    message = (
        f"[STEP] step={step} action={sanitized} "
        f"reward={reward:.2f} done={str(done).lower()} error={error if error else 'null'}"
    )
    print(message, flush=True)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the [END] summary line in the required stdout format."""
    joined = ",".join(format(r, ".2f") for r in rewards)
    line = f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={joined}"
    print(line, flush=True)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# ββ Prompts βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 59 |
+
|
| 60 |
+
SYSTEM_PROMPT = textwrap.dedent("""
|
| 61 |
+
You are an expert software engineer performing a thorough code review.
|
| 62 |
+
Your job is to:
|
| 63 |
+
1. Identify ALL bugs, security vulnerabilities, performance issues, and logic errors.
|
| 64 |
+
2. For each issue, output a JSON action with action_type="review".
|
| 65 |
+
3. After identifying all issues, output a patch with action_type="patch".
|
| 66 |
+
4. Finally, output action_type="submit" with your verdict.
|
| 67 |
+
|
| 68 |
+
Each response must be a single valid JSON object. No markdown, no explanation outside JSON.
|
| 69 |
+
|
| 70 |
+
Schema:
|
| 71 |
+
{
|
| 72 |
+
"action_type": "review" | "patch" | "comment" | "submit",
|
| 73 |
+
"severity": "critical" | "major" | "minor" | "info",
|
| 74 |
+
"issue_type": "bug" | "security" | "performance" | "logic" | "style",
|
| 75 |
+
"line_number": <int or null>,
|
| 76 |
+
"description": "<description of the issue>",
|
| 77 |
+
"patched_code": "<full corrected code>",
|
| 78 |
+
"comment": "<optional>",
|
| 79 |
+
"verdict": "approve" | "request_changes" | "reject",
|
| 80 |
+
"confidence": <0.0-1.0>
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
Output ONE JSON object per response. Be precise and thorough.
|
| 84 |
+
""").strip()
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def build_user_prompt(obs: Dict[str, Any]) -> str:
    """Build the per-step user message for the LLM from observation dict *obs*.

    Renders the PR metadata, all changed files, the list of issues already
    reported (to discourage repetition), and a closing instruction that
    depends on how many steps remain.
    """
    ctx = obs["review_context"]
    files_text = "\n\n".join(
        f"=== {f['filename']} ({f['language']}) ===\n{f['content']}"
        for f in ctx["files_changed"]
    )
    issues_so_far = obs.get("issues_found_so_far", [])

    # NOTE(review): dedent runs AFTER f-string interpolation, so unindented
    # lines from {files_text} can prevent any common prefix from being
    # stripped β verify the rendered prompt layout against a real episode.
    prompt = textwrap.dedent(f"""
    Pull Request: {ctx['pull_request_title']}
    Author: {ctx['author']}
    Description: {ctx['description']}
    Linter: {ctx.get('linter_output', 'N/A')}
    Tests: {ctx.get('test_results', 'N/A')}

    --- CODE ---
    {files_text}
    --- END CODE ---

    Step: {obs['step']} / {obs['max_steps']}
    Issues reported so far: {len(issues_so_far)}
    """).strip()

    if issues_so_far:
        prompt += "\n\nIssues already reported (do NOT repeat these):"
        for iss in issues_so_far:
            prompt += f"\n - [{iss.get('severity','?')}] line {iss.get('line','?')}: {iss.get('description','')}"

    # Closing instruction varies with remaining budget.
    steps_left = obs['max_steps'] - obs['step']
    if steps_left <= 2:
        prompt += "\n\nYou are almost out of steps. Submit your patch and verdict NOW."
    elif obs['step'] == 0:
        prompt += "\n\nBegin your review. Output your first action as JSON."
    else:
        prompt += "\n\nContinue reviewing or submit if done. Output next action as JSON."

    return prompt
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
# ββ JSON extraction βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 127 |
+
|
| 128 |
+
def extract_json(text: str) -> Dict[str, Any]:
    """Extract the first JSON object from an LLM response.

    Handles three shapes: a bare JSON object, a fenced ``` block, and a
    JSON object embedded in surrounding prose.

    The previous implementation scanned for balanced braces by counting
    '{' / '}' characters, which miscounts braces inside JSON string
    values (e.g. ``{"d": "x}y"}``).  ``json.JSONDecoder.raw_decode`` is
    string-aware and parses the first complete value, ignoring trailing
    text.

    Raises:
        ValueError: if no parseable JSON object is found.
    """
    text = text.strip()
    # Strip a markdown code fence (```json ... ```), if present.
    if text.startswith("```"):
        lines = text.split("\n")
        text = "\n".join(lines[1:-1]) if len(lines) > 2 else text
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    start = text.find("{")
    if start == -1:
        raise ValueError("No JSON object found in response")
    try:
        obj, _end = json.JSONDecoder().raw_decode(text[start:])
    except json.JSONDecodeError:
        raise ValueError("Unbalanced JSON in response")
    return obj
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
# ββ Episode runner ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 152 |
+
|
| 153 |
+
def run_episode(client: OpenAI, task_id: str) -> Dict[str, Any]:
    """Run one full episode of *task_id* with the given LLM client.

    Emits [START]/[STEP] log lines as the episode progresses; the [END]
    line is printed in a ``finally`` block so it appears even if the
    episode raises.  Returns a summary dict with keys
    ``task_id``, ``score``, ``steps``, ``success``.
    """
    env = CodeReviewEnv()
    obs_obj = env.reset(task_id)
    obs = obs_obj.model_dump()

    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    history: List[Dict[str, str]] = []  # chat transcript reused across steps
    patch_submitted = False
    error_msg: Optional[str] = None

    try:
        for step in range(1, obs_obj.max_steps + 1):
            if obs.get("done"):
                break

            error_msg = None
            steps_left = obs["max_steps"] - obs["step"]

            # Force patch then submit near step limit
            if steps_left <= 1 and not patch_submitted:
                # Fallback patch: resubmit the first file unchanged so the
                # grader at least has something syntactically valid.
                action_dict = {
                    "action_type": "patch",
                    "patched_code": obs["review_context"]["files_changed"][0]["content"],
                }
            elif steps_left <= 0:
                # Completely out of budget: force a final submit.
                action_dict = {
                    "action_type": "submit",
                    "verdict": "request_changes",
                    "confidence": 0.5,
                }
            else:
                user_msg = build_user_prompt(obs)
                history.append({"role": "user", "content": user_msg})

                try:
                    completion = client.chat.completions.create(
                        model=MODEL_NAME,
                        messages=[{"role": "system", "content": SYSTEM_PROMPT}] + history,
                        max_tokens=1024,
                        temperature=0.2,
                        stream=False,
                    )
                    raw = (completion.choices[0].message.content or "").strip()
                    history.append({"role": "assistant", "content": raw})
                    action_dict = extract_json(raw)
                except Exception as exc:
                    # API or parse failure: degrade to a safe submit action.
                    error_msg = str(exc)[:80]
                    action_dict = {
                        "action_type": "submit",
                        "verdict": "request_changes",
                        "confidence": 0.3,
                    }

            if action_dict.get("action_type") == "patch":
                patch_submitted = True

            # Validate action
            try:
                action = ReviewAction(**action_dict)
            except Exception as exc:
                error_msg = str(exc)[:80]
                action = ReviewAction(
                    action_type="submit",
                    verdict="request_changes",
                    confidence=0.3,
                )

            # Step environment
            obs_obj, reward_obj, done, info = env.step(action)
            obs = obs_obj.model_dump()

            reward = reward_obj.value
            rewards.append(reward)
            steps_taken = step

            action_summary = f"{action.action_type}:{(action.description or action.verdict or '')[:60]}"
            log_step(step=step, action=action_summary, reward=reward, done=done, error=error_msg)

            if done:
                score = info.get("final_score", 0.0)
                break

        success = score >= SUCCESS_SCORE_THRESHOLD

    finally:
        # Always emit the [END] line, even if the loop raised.
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return {"task_id": task_id, "score": score, "steps": steps_taken, "success": success}
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
# ββ Main ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 249 |
+
|
| 250 |
+
def main() -> None:
    """Entry point: run every configured task and print a stderr summary."""
    if not API_KEY:
        print("[ERROR] HF_TOKEN environment variable not set.", flush=True)
        sys.exit(1)

    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    raw_ids = os.getenv("TASK_IDS", ",".join(TASK_IDS))
    task_ids = [tid.strip() for tid in raw_ids.split(",") if tid.strip()]

    all_results = [run_episode(client, tid) for tid in task_ids]

    # Aggregate summary to stderr so it doesn't pollute stdout log format
    print("\n[SUMMARY]", file=sys.stderr)
    for r in all_results:
        print(f" {r['task_id']}: score={r['score']:.3f} steps={r['steps']} success={r['success']}", file=sys.stderr)
    if all_results:
        avg = sum(r["score"] for r in all_results) / len(all_results)
        print(f" aggregate: {avg:.3f}", file=sys.stderr)
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
if __name__ == "__main__":
|
| 275 |
+
main()
|
models.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenEnv-compliant Pydantic models for the Code Review Environment.
|
| 3 |
+
"""
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
from typing import Any, Dict, List, Literal, Optional
|
| 6 |
+
from pydantic import BaseModel, Field
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# βββ Action Space ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 10 |
+
|
| 11 |
+
class ReviewAction(BaseModel):
    """Agent action: review and optionally patch code.

    One ``action_type`` is chosen per step; the remaining fields are
    optional and only meaningful for the matching action type.
    """
    action_type: Literal["review", "patch", "comment", "submit"] = Field(
        description="Type of action the agent takes."
    )
    # For 'review': provide a structured analysis
    severity: Optional[Literal["critical", "major", "minor", "info"]] = None
    issue_type: Optional[str] = Field(
        default=None,
        description="Category: bug, security, performance, style, logic"
    )
    # 1-indexed source line the issue refers to (must be >= 1 when given).
    line_number: Optional[int] = Field(default=None, ge=1)
    description: Optional[str] = Field(default=None, max_length=500)

    # For 'patch': provide fixed code
    patched_code: Optional[str] = Field(
        default=None,
        description="Full corrected code (for patch actions)."
    )

    # For 'comment': free-form annotation
    comment: Optional[str] = Field(default=None, max_length=1000)

    # For 'submit': final verdict
    verdict: Optional[Literal["approve", "request_changes", "reject"]] = None
    # Agent's self-reported certainty in [0.0, 1.0].
    confidence: Optional[float] = Field(default=None, ge=0.0, le=1.0)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# βββ Observation Space βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
+
|
| 41 |
+
class CodeFile(BaseModel):
    """A single changed file in the pull request under review."""
    filename: str    # path shown to the agent
    language: str    # e.g. "python"; informational only
    content: str     # full file text
    line_count: int  # line count as supplied by the task config (not recomputed)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class ReviewContext(BaseModel):
    """Static pull-request context presented to the agent at every step."""
    pull_request_title: str
    author: str
    description: str
    files_changed: List[CodeFile]
    test_results: Optional[str] = None   # raw test runner output, when the task provides it
    linter_output: Optional[str] = None  # raw linter output, when the task provides it
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class Observation(BaseModel):
    """What the agent sees at each step."""
    task_id: str
    step: int                      # number of steps taken so far in the episode
    max_steps: int                 # episode step budget
    review_context: ReviewContext  # static PR context for this task
    previous_actions: List[ReviewAction] = Field(default_factory=list)
    feedback: Optional[str] = None  # environment message (e.g. "Episode already finished.")
    issues_found_so_far: List[Dict[str, Any]] = Field(default_factory=list)
    score_so_far: float = 0.0      # cumulative shaped reward (replaced by final grade at episode end)
    done: bool = False
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# βββ Reward Model ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 71 |
+
|
| 72 |
+
class StepReward(BaseModel):
    """Reward signal returned at each step."""
    # Scalar reward, constrained by pydantic to [-1.0, 1.0].
    value: float = Field(ge=-1.0, le=1.0)
    # Per-component contributions (e.g. "step_penalty", "patch_submitted").
    breakdown: Dict[str, float] = Field(default_factory=dict)
    explanation: str = ""
+
|
| 78 |
+
|
| 79 |
+
# βββ State βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 80 |
+
|
| 81 |
+
class EnvironmentState(BaseModel):
    """Full internal episode state tracked by the environment (also consumed by graders)."""
    task_id: str
    step: int
    max_steps: int
    review_context: ReviewContext
    actions_taken: List[ReviewAction] = Field(default_factory=list)
    issues_identified: List[Dict[str, Any]] = Field(default_factory=list)
    patch_submitted: Optional[str] = None    # most recent patched_code, if any
    verdict_submitted: Optional[str] = None  # final verdict from a submit action, if any
    total_reward: float = 0.0                # shaped cumulative reward; overwritten by the final grade
    done: bool = False
    terminated_reason: Optional[str] = None  # "submitted" or "max_steps_reached"
|
openenv.yaml
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: code-review-env
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
spec: openenv/v1
|
| 4 |
+
tags:
|
| 5 |
+
- openenv
|
| 6 |
+
- code-review
|
| 7 |
+
- software-engineering
|
| 8 |
+
- security
|
| 9 |
+
- agent-evaluation
|
| 10 |
+
|
| 11 |
+
description: >
|
| 12 |
+
A code review environment where AI agents act as senior engineers reviewing
|
| 13 |
+
pull requests. Tasks span bug hunting (easy), security auditing (medium),
|
| 14 |
+
and distributed systems correctness review (hard). Fully OpenEnv-compliant
|
| 15 |
+
with typed Pydantic models, dense reward signals, and programmatic graders.
|
| 16 |
+
|
| 17 |
+
author: "Meta Hackathon Submission"
|
| 18 |
+
license: MIT
|
| 19 |
+
|
| 20 |
+
observation_space:
|
| 21 |
+
type: object
|
| 22 |
+
description: >
|
| 23 |
+
Structured pull request context including code files, linter output,
|
| 24 |
+
test results, and history of previous actions taken in the episode.
|
| 25 |
+
fields:
|
| 26 |
+
- task_id: string
|
| 27 |
+
- step: integer
|
| 28 |
+
- max_steps: integer
|
| 29 |
+
- review_context: ReviewContext
|
| 30 |
+
- previous_actions: list[ReviewAction]
|
| 31 |
+
- issues_found_so_far: list[dict]
|
| 32 |
+
- score_so_far: float [0.0, 1.0]
|
| 33 |
+
- done: boolean
|
| 34 |
+
|
| 35 |
+
action_space:
|
| 36 |
+
type: object
|
| 37 |
+
description: >
|
| 38 |
+
Agents may review (annotate an issue), patch (submit corrected code),
|
| 39 |
+
comment (free-form annotation), or submit (final verdict).
|
| 40 |
+
action_types:
|
| 41 |
+
- review: annotate a specific issue with severity, type, line, and description
|
| 42 |
+
- patch: provide full corrected code
|
| 43 |
+
- comment: free-form annotation
|
| 44 |
+
- submit: final verdict (approve | request_changes | reject) with confidence
|
| 45 |
+
|
| 46 |
+
reward:
|
| 47 |
+
type: dense
|
| 48 |
+
range: [-1.0, 1.0]
|
| 49 |
+
description: >
|
| 50 |
+
Intermediate reward encourages efficient, non-repetitive, actionable reviews.
|
| 51 |
+
Final reward (at submit or max_steps) is the programmatic grader score in [0.0, 1.0].
|
| 52 |
+
components:
|
| 53 |
+
step_penalty: -0.01 per step (encourages efficiency)
|
| 54 |
+
review_description_bonus: +0.05 for substantive review action
|
| 55 |
+
critical_severity_bonus: +0.03 for marking an issue as critical
|
| 56 |
+
patch_submitted_bonus: +0.10 for submitting non-trivial patch
|
| 57 |
+
repetition_penalty: -0.05 for repeating identical descriptions
|
| 58 |
+
|
| 59 |
+
tasks:
|
| 60 |
+
- id: task_1_easy_bug_hunt
|
| 61 |
+
difficulty: easy
|
| 62 |
+
max_steps: 8
|
| 63 |
+
description: >
|
| 64 |
+
Find three planted bugs in a Python utility module:
|
| 65 |
+
assignment-instead-of-comparison, off-by-one loop bound, missing return.
|
| 66 |
+
grader: keyword-match + AST parse of patch
|
| 67 |
+
max_score: 1.0
|
| 68 |
+
|
| 69 |
+
- id: task_2_medium_security
|
| 70 |
+
difficulty: medium
|
| 71 |
+
max_steps: 12
|
| 72 |
+
description: >
|
| 73 |
+
Audit a Flask authentication endpoint for six security vulnerabilities:
|
| 74 |
+
SQL injection (Γ2), plaintext passwords, no rate limiting,
|
| 75 |
+
sensitive data leakage, hardcoded secret key.
|
| 76 |
+
grader: keyword-match across action descriptions + patch structural check
|
| 77 |
+
max_score: 1.0
|
| 78 |
+
|
| 79 |
+
- id: task_3_hard_perf_correctness
|
| 80 |
+
difficulty: hard
|
| 81 |
+
max_steps: 16
|
| 82 |
+
description: >
|
| 83 |
+
Review a distributed LRU cache backed by Redis for six issues:
|
| 84 |
+
race condition, memory leak, N+1 query, wrong LRU order,
|
| 85 |
+
thread-safety violation, pickle deserialization exploit.
|
| 86 |
+
grader: keyword-match + patch structural check (Lock, OrderedDict, mget, json)
|
| 87 |
+
max_score: 1.0
|
| 88 |
+
|
| 89 |
+
baseline_scores:
|
| 90 |
+
model: Qwen/Qwen2.5-72B-Instruct
|
| 91 |
+
task_1_easy_bug_hunt: 0.72
|
| 92 |
+
task_2_medium_security: 0.55
|
| 93 |
+
task_3_hard_perf_correctness: 0.38
|
| 94 |
+
aggregate: 0.55
|
| 95 |
+
|
| 96 |
+
deployment:
|
| 97 |
+
platform: huggingface_spaces
|
| 98 |
+
sdk: docker
|
| 99 |
+
port: 7860
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.111.0
|
| 2 |
+
uvicorn[standard]==0.29.0
|
| 3 |
+
pydantic==2.7.1
|
| 4 |
+
openai>=1.30.0
|
| 5 |
+
requests>=2.31.0
|
| 6 |
+
pyyaml>=6.0
|
server.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI server exposing the CodeReviewEnv as an HTTP API.
|
| 3 |
+
Endpoints mirror the OpenEnv interface: /reset, /step, /state, /tasks.
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import sys
|
| 8 |
+
import os
|
| 9 |
+
sys.path.insert(0, os.path.dirname(__file__))
|
| 10 |
+
|
| 11 |
+
from typing import Any, Dict
|
| 12 |
+
import uuid
|
| 13 |
+
|
| 14 |
+
from fastapi import FastAPI, HTTPException
|
| 15 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 16 |
+
from pydantic import BaseModel
|
| 17 |
+
|
| 18 |
+
from env import CodeReviewEnv, TASK_IDS
|
| 19 |
+
from models import ReviewAction, Observation, StepReward, EnvironmentState
|
| 20 |
+
|
| 21 |
+
app = FastAPI(
|
| 22 |
+
title="CodeReviewEnv",
|
| 23 |
+
description="OpenEnv-compliant environment for AI code review agents.",
|
| 24 |
+
version="1.0.0",
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
app.add_middleware(
|
| 28 |
+
CORSMiddleware,
|
| 29 |
+
allow_origins=["*"],
|
| 30 |
+
allow_methods=["*"],
|
| 31 |
+
allow_headers=["*"],
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
# ββ Session store (in-memory, single process) ββββββββββββββββββββββββββββββββ
|
| 35 |
+
_sessions: Dict[str, CodeReviewEnv] = {}
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ββ Request / Response models βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 39 |
+
|
| 40 |
+
class ResetRequest(BaseModel):
|
| 41 |
+
task_id: str = "task_1_easy_bug_hunt"
|
| 42 |
+
session_id: str | None = None
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class ResetResponse(BaseModel):
|
| 46 |
+
session_id: str
|
| 47 |
+
observation: Observation
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class StepRequest(BaseModel):
|
| 51 |
+
session_id: str
|
| 52 |
+
action: ReviewAction
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class StepResponse(BaseModel):
|
| 56 |
+
observation: Observation
|
| 57 |
+
reward: StepReward
|
| 58 |
+
done: bool
|
| 59 |
+
info: Dict[str, Any]
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ββ Routes ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 63 |
+
|
| 64 |
+
@app.get("/")
|
| 65 |
+
def root():
|
| 66 |
+
return {
|
| 67 |
+
"name": "CodeReviewEnv",
|
| 68 |
+
"version": "1.0.0",
|
| 69 |
+
"tasks": TASK_IDS,
|
| 70 |
+
"spec": "OpenEnv v1",
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
@app.get("/tasks")
|
| 75 |
+
def list_tasks():
|
| 76 |
+
from tasks.task1_easy import get_task_config as t1
|
| 77 |
+
from tasks.task2_medium import get_task_config as t2
|
| 78 |
+
from tasks.task3_hard import get_task_config as t3
|
| 79 |
+
tasks = []
|
| 80 |
+
for fn in (t1, t2, t3):
|
| 81 |
+
cfg = fn()
|
| 82 |
+
tasks.append({
|
| 83 |
+
"task_id": cfg["task_id"],
|
| 84 |
+
"difficulty": cfg["difficulty"],
|
| 85 |
+
"description": cfg["description"],
|
| 86 |
+
})
|
| 87 |
+
return {"tasks": tasks}
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
@app.post("/reset", response_model=ResetResponse)
|
| 91 |
+
def reset(req: ResetRequest):
|
| 92 |
+
if req.task_id not in TASK_IDS:
|
| 93 |
+
raise HTTPException(400, f"Unknown task_id {req.task_id!r}. Choose from {TASK_IDS}")
|
| 94 |
+
session_id = req.session_id or str(uuid.uuid4())
|
| 95 |
+
env = CodeReviewEnv()
|
| 96 |
+
obs = env.reset(req.task_id)
|
| 97 |
+
_sessions[session_id] = env
|
| 98 |
+
return ResetResponse(session_id=session_id, observation=obs)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
@app.post("/step", response_model=StepResponse)
|
| 102 |
+
def step(req: StepRequest):
|
| 103 |
+
env = _sessions.get(req.session_id)
|
| 104 |
+
if env is None:
|
| 105 |
+
raise HTTPException(404, f"Session {req.session_id!r} not found. Call /reset first.")
|
| 106 |
+
obs, reward, done, info = env.step(req.action)
|
| 107 |
+
return StepResponse(observation=obs, reward=reward, done=done, info=info)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
@app.get("/state/{session_id}", response_model=EnvironmentState)
|
| 111 |
+
def get_state(session_id: str):
|
| 112 |
+
env = _sessions.get(session_id)
|
| 113 |
+
if env is None:
|
| 114 |
+
raise HTTPException(404, f"Session {session_id!r} not found.")
|
| 115 |
+
return env.state()
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
@app.delete("/session/{session_id}")
|
| 119 |
+
def delete_session(session_id: str):
|
| 120 |
+
_sessions.pop(session_id, None)
|
| 121 |
+
return {"deleted": session_id}
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
if __name__ == "__main__":
|
| 125 |
+
import uvicorn
|
| 126 |
+
uvicorn.run("server:app", host="0.0.0.0", port=7860, reload=False)
|
tasks/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# tasks package
|
tasks/task1_easy.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task 1 (Easy): Bug Identification in a Simple Python Utility.
|
| 3 |
+
|
| 4 |
+
The agent reviews a short Python module with 3 clearly planted bugs:
|
| 5 |
+
1. Off-by-one error in a loop
|
| 6 |
+
2. Incorrect comparison operator (= vs ==)
|
| 7 |
+
3. Missing return statement in a branch
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
from typing import Any, Dict
|
| 11 |
+
|
| 12 |
+
TASK_ID = "task_1_easy_bug_hunt"
|
| 13 |
+
MAX_STEPS = 8
|
| 14 |
+
|
| 15 |
+
BUGGY_CODE = '''\
|
| 16 |
+
def find_max(numbers: list) -> int:
|
| 17 |
+
"""Return the maximum value in a non-empty list."""
|
| 18 |
+
if len(numbers) = 0: # BUG 1: assignment instead of == comparison
|
| 19 |
+
raise ValueError("List is empty")
|
| 20 |
+
max_val = numbers[0]
|
| 21 |
+
for i in range(1, len(numbers) + 1): # BUG 2: off-by-one, should be len(numbers)
|
| 22 |
+
if numbers[i] > max_val:
|
| 23 |
+
max_val = numbers[i]
|
| 24 |
+
# BUG 3: missing return statement β falls off the end returning None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def calculate_average(numbers: list) -> float:
|
| 28 |
+
"""Return the arithmetic mean of a list of numbers."""
|
| 29 |
+
if not numbers:
|
| 30 |
+
raise ValueError("Cannot average empty list")
|
| 31 |
+
total = 0
|
| 32 |
+
for n in numbers:
|
| 33 |
+
total += n
|
| 34 |
+
return total / len(numbers)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def is_palindrome(s: str) -> bool:
|
| 38 |
+
"""Check whether a string is a palindrome (case-insensitive)."""
|
| 39 |
+
cleaned = s.lower().replace(" ", "")
|
| 40 |
+
return cleaned == cleaned[::-1]
|
| 41 |
+
'''
|
| 42 |
+
|
| 43 |
+
FIXED_CODE = '''\
|
| 44 |
+
def find_max(numbers: list) -> int:
|
| 45 |
+
"""Return the maximum value in a non-empty list."""
|
| 46 |
+
if len(numbers) == 0:
|
| 47 |
+
raise ValueError("List is empty")
|
| 48 |
+
max_val = numbers[0]
|
| 49 |
+
for i in range(1, len(numbers)):
|
| 50 |
+
if numbers[i] > max_val:
|
| 51 |
+
max_val = numbers[i]
|
| 52 |
+
return max_val
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def calculate_average(numbers: list) -> float:
|
| 56 |
+
"""Return the arithmetic mean of a list of numbers."""
|
| 57 |
+
if not numbers:
|
| 58 |
+
raise ValueError("Cannot average empty list")
|
| 59 |
+
total = 0
|
| 60 |
+
for n in numbers:
|
| 61 |
+
total += n
|
| 62 |
+
return total / len(numbers)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def is_palindrome(s: str) -> bool:
|
| 66 |
+
"""Check whether a string is a palindrome (case-insensitive)."""
|
| 67 |
+
cleaned = s.lower().replace(" ", "")
|
| 68 |
+
return cleaned == cleaned[::-1]
|
| 69 |
+
'''
|
| 70 |
+
|
| 71 |
+
KNOWN_BUGS = {
|
| 72 |
+
"bug_comparison_operator": {
|
| 73 |
+
"line": 3,
|
| 74 |
+
"description_keywords": ["assignment", "comparison", "==", "=", "operator"],
|
| 75 |
+
"severity": "critical",
|
| 76 |
+
"issue_type": "bug",
|
| 77 |
+
},
|
| 78 |
+
"bug_off_by_one": {
|
| 79 |
+
"line": 6,
|
| 80 |
+
"description_keywords": ["off-by-one", "index", "range", "len", "+1", "IndexError"],
|
| 81 |
+
"severity": "critical",
|
| 82 |
+
"issue_type": "bug",
|
| 83 |
+
},
|
| 84 |
+
"bug_missing_return": {
|
| 85 |
+
"line": 9,
|
| 86 |
+
"description_keywords": ["return", "None", "missing", "falls off"],
|
| 87 |
+
"severity": "major",
|
| 88 |
+
"issue_type": "bug",
|
| 89 |
+
},
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
PULL_REQUEST = {
|
| 93 |
+
"pull_request_title": "Add utility functions: find_max, calculate_average, is_palindrome",
|
| 94 |
+
"author": "dev-intern",
|
| 95 |
+
"description": (
|
| 96 |
+
"Implements three utility functions for list and string operations. "
|
| 97 |
+
"Please review for correctness before merging."
|
| 98 |
+
),
|
| 99 |
+
"files_changed": [
|
| 100 |
+
{
|
| 101 |
+
"filename": "utils.py",
|
| 102 |
+
"language": "python",
|
| 103 |
+
"content": BUGGY_CODE,
|
| 104 |
+
"line_count": BUGGY_CODE.count("\n") + 1,
|
| 105 |
+
}
|
| 106 |
+
],
|
| 107 |
+
"test_results": "No tests provided.",
|
| 108 |
+
"linter_output": "SyntaxError detected on line 3 (invalid syntax).",
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def get_task_config() -> Dict[str, Any]:
|
| 113 |
+
return {
|
| 114 |
+
"task_id": TASK_ID,
|
| 115 |
+
"max_steps": MAX_STEPS,
|
| 116 |
+
"pull_request": PULL_REQUEST,
|
| 117 |
+
"known_bugs": KNOWN_BUGS,
|
| 118 |
+
"fixed_code": FIXED_CODE,
|
| 119 |
+
"difficulty": "easy",
|
| 120 |
+
"description": (
|
| 121 |
+
"Review a short Python utility module. "
|
| 122 |
+
"Find and describe all bugs, then submit a patched version."
|
| 123 |
+
),
|
| 124 |
+
}
|
tasks/task2_medium.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task 2 (Medium): Security Vulnerability Review in a Flask Web Endpoint.
|
| 3 |
+
|
| 4 |
+
The agent reviews a Flask user-authentication endpoint containing:
|
| 5 |
+
1. SQL injection vulnerability (string formatting into query)
|
| 6 |
+
2. Plaintext password storage (no hashing)
|
| 7 |
+
3. Missing rate limiting / brute-force protection
|
| 8 |
+
4. Sensitive data leakage in error response
|
| 9 |
+
5. Hardcoded secret key
|
| 10 |
+
"""
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
from typing import Any, Dict
|
| 13 |
+
|
| 14 |
+
TASK_ID = "task_2_medium_security"
|
| 15 |
+
MAX_STEPS = 12
|
| 16 |
+
|
| 17 |
+
BUGGY_CODE = '''\
|
| 18 |
+
import sqlite3
|
| 19 |
+
from flask import Flask, request, jsonify
|
| 20 |
+
|
| 21 |
+
app = Flask(__name__)
|
| 22 |
+
app.secret_key = "supersecret123" # VULN 5: hardcoded secret key
|
| 23 |
+
|
| 24 |
+
DB_PATH = "users.db"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_db():
|
| 28 |
+
return sqlite3.connect(DB_PATH)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@app.route("/login", methods=["POST"])
|
| 32 |
+
def login():
|
| 33 |
+
username = request.json.get("username")
|
| 34 |
+
password = request.json.get("password")
|
| 35 |
+
|
| 36 |
+
db = get_db()
|
| 37 |
+
cursor = db.cursor()
|
| 38 |
+
|
| 39 |
+
# VULN 1: SQL injection β user input directly interpolated into query
|
| 40 |
+
query = f"SELECT * FROM users WHERE username = \'{username}\' AND password = \'{password}\'"
|
| 41 |
+
cursor.execute(query)
|
| 42 |
+
user = cursor.fetchone()
|
| 43 |
+
|
| 44 |
+
if user:
|
| 45 |
+
return jsonify({"status": "ok", "user_id": user[0], "email": user[2]})
|
| 46 |
+
else:
|
| 47 |
+
# VULN 4: leaks whether username exists or password is wrong
|
| 48 |
+
cursor.execute(f"SELECT id FROM users WHERE username = \'{username}\'")
|
| 49 |
+
exists = cursor.fetchone()
|
| 50 |
+
if exists:
|
| 51 |
+
return jsonify({"error": f"Wrong password for user {username}"}), 401
|
| 52 |
+
return jsonify({"error": f"User {username} does not exist"}), 404
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@app.route("/register", methods=["POST"])
|
| 56 |
+
def register():
|
| 57 |
+
username = request.json.get("username")
|
| 58 |
+
password = request.json.get("password") # VULN 2: stored in plaintext
|
| 59 |
+
email = request.json.get("email")
|
| 60 |
+
|
| 61 |
+
db = get_db()
|
| 62 |
+
cursor = db.cursor()
|
| 63 |
+
# VULN 1 again: SQL injection in insert
|
| 64 |
+
cursor.execute(
|
| 65 |
+
f"INSERT INTO users (username, password, email) VALUES (\'{username}\', \'{password}\', \'{email}\')"
|
| 66 |
+
)
|
| 67 |
+
db.commit()
|
| 68 |
+
return jsonify({"status": "registered"})
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# VULN 3: No rate limiting on login endpoint (brute-force possible)
|
| 72 |
+
|
| 73 |
+
if __name__ == "__main__":
|
| 74 |
+
app.run(debug=True)
|
| 75 |
+
'''
|
| 76 |
+
|
| 77 |
+
FIXED_CODE = '''\
|
| 78 |
+
import os
|
| 79 |
+
import sqlite3
|
| 80 |
+
from flask import Flask, request, jsonify
|
| 81 |
+
from werkzeug.security import generate_password_hash, check_password_hash
|
| 82 |
+
from flask_limiter import Limiter
|
| 83 |
+
from flask_limiter.util import get_remote_address
|
| 84 |
+
|
| 85 |
+
app = Flask(__name__)
|
| 86 |
+
app.secret_key = os.environ.get("SECRET_KEY") # read from env, never hardcode
|
| 87 |
+
|
| 88 |
+
limiter = Limiter(get_remote_address, app=app, default_limits=["200 per day", "50 per hour"])
|
| 89 |
+
|
| 90 |
+
DB_PATH = "users.db"
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def get_db():
|
| 94 |
+
return sqlite3.connect(DB_PATH)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
@app.route("/login", methods=["POST"])
|
| 98 |
+
@limiter.limit("5 per minute") # brute-force protection
|
| 99 |
+
def login():
|
| 100 |
+
username = request.json.get("username")
|
| 101 |
+
password = request.json.get("password")
|
| 102 |
+
|
| 103 |
+
db = get_db()
|
| 104 |
+
cursor = db.cursor()
|
| 105 |
+
|
| 106 |
+
# Parameterised query β prevents SQL injection
|
| 107 |
+
cursor.execute("SELECT id, password_hash FROM users WHERE username = ?", (username,))
|
| 108 |
+
user = cursor.fetchone()
|
| 109 |
+
|
| 110 |
+
if user and check_password_hash(user[1], password):
|
| 111 |
+
return jsonify({"status": "ok", "user_id": user[0]})
|
| 112 |
+
# Generic error β does not reveal whether user exists
|
| 113 |
+
return jsonify({"error": "Invalid credentials"}), 401
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
@app.route("/register", methods=["POST"])
|
| 117 |
+
def register():
|
| 118 |
+
username = request.json.get("username")
|
| 119 |
+
password = request.json.get("password")
|
| 120 |
+
email = request.json.get("email")
|
| 121 |
+
|
| 122 |
+
db = get_db()
|
| 123 |
+
cursor = db.cursor()
|
| 124 |
+
password_hash = generate_password_hash(password)
|
| 125 |
+
cursor.execute(
|
| 126 |
+
"INSERT INTO users (username, password_hash, email) VALUES (?, ?, ?)",
|
| 127 |
+
(username, password_hash, email),
|
| 128 |
+
)
|
| 129 |
+
db.commit()
|
| 130 |
+
return jsonify({"status": "registered"})
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
if __name__ == "__main__":
|
| 134 |
+
app.run(debug=False)
|
| 135 |
+
'''
|
| 136 |
+
|
| 137 |
+
KNOWN_VULNERABILITIES = {
|
| 138 |
+
"sql_injection_login": {
|
| 139 |
+
"line": 23,
|
| 140 |
+
"description_keywords": ["sql injection", "parameterized", "f-string", "format", "interpolat", "query"],
|
| 141 |
+
"severity": "critical",
|
| 142 |
+
"issue_type": "security",
|
| 143 |
+
},
|
| 144 |
+
"sql_injection_register": {
|
| 145 |
+
"line": 44,
|
| 146 |
+
"description_keywords": ["sql injection", "parameterized", "f-string", "format", "interpolat", "insert"],
|
| 147 |
+
"severity": "critical",
|
| 148 |
+
"issue_type": "security",
|
| 149 |
+
},
|
| 150 |
+
"plaintext_password": {
|
| 151 |
+
"line": 39,
|
| 152 |
+
"description_keywords": ["plaintext", "hash", "bcrypt", "werkzeug", "password", "store"],
|
| 153 |
+
"severity": "critical",
|
| 154 |
+
"issue_type": "security",
|
| 155 |
+
},
|
| 156 |
+
"no_rate_limiting": {
|
| 157 |
+
"line": None,
|
| 158 |
+
"description_keywords": ["rate limit", "brute force", "throttl", "limiter"],
|
| 159 |
+
"severity": "major",
|
| 160 |
+
"issue_type": "security",
|
| 161 |
+
},
|
| 162 |
+
"sensitive_data_leak": {
|
| 163 |
+
"line": 30,
|
| 164 |
+
"description_keywords": ["leak", "enumerat", "username exist", "generic error", "information disclos"],
|
| 165 |
+
"severity": "major",
|
| 166 |
+
"issue_type": "security",
|
| 167 |
+
},
|
| 168 |
+
"hardcoded_secret": {
|
| 169 |
+
"line": 5,
|
| 170 |
+
"description_keywords": ["hardcode", "secret", "env", "environment variable", "secret_key"],
|
| 171 |
+
"severity": "major",
|
| 172 |
+
"issue_type": "security",
|
| 173 |
+
},
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
PULL_REQUEST = {
|
| 177 |
+
"pull_request_title": "Implement user login and registration API endpoints",
|
| 178 |
+
"author": "backend-dev",
|
| 179 |
+
"description": (
|
| 180 |
+
"Adds /login and /register REST endpoints backed by SQLite. "
|
| 181 |
+
"Ready for production review."
|
| 182 |
+
),
|
| 183 |
+
"files_changed": [
|
| 184 |
+
{
|
| 185 |
+
"filename": "auth.py",
|
| 186 |
+
"language": "python",
|
| 187 |
+
"content": BUGGY_CODE,
|
| 188 |
+
"line_count": BUGGY_CODE.count("\n") + 1,
|
| 189 |
+
}
|
| 190 |
+
],
|
| 191 |
+
"test_results": "Manual testing passed on happy path.",
|
| 192 |
+
"linter_output": "No linter warnings.",
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def get_task_config() -> Dict[str, Any]:
|
| 197 |
+
return {
|
| 198 |
+
"task_id": TASK_ID,
|
| 199 |
+
"max_steps": MAX_STEPS,
|
| 200 |
+
"pull_request": PULL_REQUEST,
|
| 201 |
+
"known_vulnerabilities": KNOWN_VULNERABILITIES,
|
| 202 |
+
"fixed_code": FIXED_CODE,
|
| 203 |
+
"difficulty": "medium",
|
| 204 |
+
"description": (
|
| 205 |
+
"Review a Flask authentication endpoint for security vulnerabilities. "
|
| 206 |
+
"Identify all issues by category and severity, then provide a secure patched version."
|
| 207 |
+
),
|
| 208 |
+
}
|
tasks/task3_hard.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task 3 (Hard): Performance & Correctness Review of a Distributed LRU Cache.
|
| 3 |
+
|
| 4 |
+
The agent reviews a Python LRU cache with Redis backing containing:
|
| 5 |
+
1. Race condition (non-atomic check-then-act on Redis)
|
| 6 |
+
2. Memory leak (unbounded local dict grows forever)
|
| 7 |
+
3. N+1 query pattern (per-key pipeline not batched)
|
| 8 |
+
4. Incorrect LRU eviction (uses insertion order, not access order)
|
| 9 |
+
5. Thread-safety violation (shared dict without lock)
|
| 10 |
+
6. Silent data corruption (pickle loads untrusted bytes)
|
| 11 |
+
"""
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
from typing import Any, Dict
|
| 14 |
+
|
| 15 |
+
TASK_ID = "task_3_hard_perf_correctness"
|
| 16 |
+
MAX_STEPS = 16
|
| 17 |
+
|
| 18 |
+
BUGGY_CODE = '''\
|
| 19 |
+
import pickle
|
| 20 |
+
import threading
|
| 21 |
+
import redis
|
| 22 |
+
|
| 23 |
+
class DistributedLRUCache:
|
| 24 |
+
"""
|
| 25 |
+
LRU cache backed by Redis for distributed deployments.
|
| 26 |
+
Local dict acts as an L1 write-through layer.
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
def __init__(self, capacity: int, redis_url: str = "redis://localhost:6379"):
|
| 30 |
+
self.capacity = capacity
|
| 31 |
+
self.local = {} # ISSUE 2 & 5: shared dict, no lock, unbounded growth
|
| 32 |
+
self.redis = redis.from_url(redis_url)
|
| 33 |
+
self.hits = 0
|
| 34 |
+
self.misses = 0
|
| 35 |
+
|
| 36 |
+
# ββ ISSUE 5: no lock; concurrent writes race on self.local ββββββββββββββ
|
| 37 |
+
def get(self, key: str):
|
| 38 |
+
if key in self.local:
|
| 39 |
+
self.hits += 1
|
| 40 |
+
return self.local[key] # ISSUE 4: doesn't update LRU order
|
| 41 |
+
|
| 42 |
+
# ISSUE 1: race condition β between EXISTS and GET another process may delete key
|
| 43 |
+
if self.redis.exists(key):
|
| 44 |
+
raw = self.redis.get(key)
|
| 45 |
+
value = pickle.loads(raw) # ISSUE 6: deserialising untrusted bytes
|
| 46 |
+
self.local[key] = value # ISSUE 2: local dict grows without bound
|
| 47 |
+
self.hits += 1
|
| 48 |
+
return value
|
| 49 |
+
|
| 50 |
+
self.misses += 1
|
| 51 |
+
return None
|
| 52 |
+
|
| 53 |
+
def put(self, key: str, value, ttl: int = 300):
|
| 54 |
+
# ISSUE 2: no eviction from self.local; grows forever
|
| 55 |
+
self.local[key] = value
|
| 56 |
+
|
| 57 |
+
# ISSUE 1: non-atomic: set + expire are two separate commands
|
| 58 |
+
self.redis.set(key, pickle.dumps(value))
|
| 59 |
+
self.redis.expire(key, ttl)
|
| 60 |
+
|
| 61 |
+
def get_many(self, keys: list):
|
| 62 |
+
# ISSUE 3: N+1 β calls self.get() in a loop instead of using pipeline/mget
|
| 63 |
+
return {k: self.get(k) for k in keys}
|
| 64 |
+
|
| 65 |
+
def invalidate(self, key: str):
|
| 66 |
+
self.local.pop(key, None)
|
| 67 |
+
self.redis.delete(key)
|
| 68 |
+
|
| 69 |
+
def stats(self):
|
| 70 |
+
total = self.hits + self.misses
|
| 71 |
+
return {
|
| 72 |
+
"hits": self.hits,
|
| 73 |
+
"misses": self.misses,
|
| 74 |
+
"hit_rate": self.hits / total if total else 0,
|
| 75 |
+
"local_size": len(self.local),
|
| 76 |
+
}
|
| 77 |
+
'''
|
| 78 |
+
|
| 79 |
+
FIXED_CODE = '''\
|
| 80 |
+
import json
|
| 81 |
+
import threading
|
| 82 |
+
from collections import OrderedDict
|
| 83 |
+
import redis
|
| 84 |
+
|
| 85 |
+
class DistributedLRUCache:
|
| 86 |
+
"""
|
| 87 |
+
Thread-safe LRU cache backed by Redis.
|
| 88 |
+
Uses OrderedDict for correct LRU eviction, a Lock for thread safety,
|
| 89 |
+
atomic Redis SET EX commands, and mget for batch fetching.
|
| 90 |
+
Serialises with JSON (not pickle) to avoid arbitrary code execution.
|
| 91 |
+
"""
|
| 92 |
+
|
| 93 |
+
def __init__(self, capacity: int, redis_url: str = "redis://localhost:6379"):
|
| 94 |
+
self.capacity = capacity
|
| 95 |
+
self.local: OrderedDict = OrderedDict() # correct LRU order
|
| 96 |
+
self._lock = threading.Lock() # thread safety
|
| 97 |
+
self.redis = redis.from_url(redis_url)
|
| 98 |
+
self.hits = 0
|
| 99 |
+
self.misses = 0
|
| 100 |
+
|
| 101 |
+
def get(self, key: str):
|
| 102 |
+
with self._lock:
|
| 103 |
+
if key in self.local:
|
| 104 |
+
self.local.move_to_end(key) # update LRU order
|
| 105 |
+
self.hits += 1
|
| 106 |
+
return self.local[key]
|
| 107 |
+
|
| 108 |
+
raw = self.redis.get(key) # atomic single GET, no race
|
| 109 |
+
if raw is not None:
|
| 110 |
+
value = json.loads(raw) # safe deserialisation
|
| 111 |
+
with self._lock:
|
| 112 |
+
self._evict_if_needed()
|
| 113 |
+
self.local[key] = value
|
| 114 |
+
self.hits += 1
|
| 115 |
+
return value
|
| 116 |
+
|
| 117 |
+
with self._lock:
|
| 118 |
+
self.misses += 1
|
| 119 |
+
return None
|
| 120 |
+
|
| 121 |
+
def _evict_if_needed(self):
|
| 122 |
+
"""Call with self._lock held."""
|
| 123 |
+
while len(self.local) >= self.capacity:
|
| 124 |
+
self.local.popitem(last=False) # evict LRU item
|
| 125 |
+
|
| 126 |
+
def put(self, key: str, value, ttl: int = 300):
|
| 127 |
+
payload = json.dumps(value)
|
| 128 |
+
self.redis.set(key, payload, ex=ttl) # atomic SET with TTL
|
| 129 |
+
with self._lock:
|
| 130 |
+
self.local[key] = value
|
| 131 |
+
self.local.move_to_end(key)
|
| 132 |
+
self._evict_if_needed()
|
| 133 |
+
|
| 134 |
+
def get_many(self, keys: list):
|
| 135 |
+
"""Batch fetch using Redis MGET β O(1) round trips."""
|
| 136 |
+
if not keys:
|
| 137 |
+
return {}
|
| 138 |
+
raws = self.redis.mget(keys)
|
| 139 |
+
result = {}
|
| 140 |
+
with self._lock:
|
| 141 |
+
for key, raw in zip(keys, raws):
|
| 142 |
+
if raw is not None:
|
| 143 |
+
value = json.loads(raw)
|
| 144 |
+
self._evict_if_needed()
|
| 145 |
+
self.local[key] = value
|
| 146 |
+
self.hits += 1
|
| 147 |
+
result[key] = value
|
| 148 |
+
else:
|
| 149 |
+
self.misses += 1
|
| 150 |
+
result[key] = None
|
| 151 |
+
return result
|
| 152 |
+
|
| 153 |
+
def invalidate(self, key: str):
|
| 154 |
+
with self._lock:
|
| 155 |
+
self.local.pop(key, None)
|
| 156 |
+
self.redis.delete(key)
|
| 157 |
+
|
| 158 |
+
def stats(self):
|
| 159 |
+
with self._lock:
|
| 160 |
+
total = self.hits + self.misses
|
| 161 |
+
return {
|
| 162 |
+
"hits": self.hits,
|
| 163 |
+
"misses": self.misses,
|
| 164 |
+
"hit_rate": self.hits / total if total else 0,
|
| 165 |
+
"local_size": len(self.local),
|
| 166 |
+
}
|
| 167 |
+
'''
|
| 168 |
+
|
| 169 |
+
# Ground-truth defect registry for this task.  Each key names one seeded bug in
# the reviewed file; "lines" are the line numbers in that buggy file where the
# defect appears, "description_keywords" are cue phrases for matching the
# agent's written finding (presumably any single match counts — confirm against
# graders/grader.py), and severity / issue_type feed the scoring.
KNOWN_ISSUES: Dict[str, Any] = {
    "race_condition": {
        "lines": [23, 43],
        "description_keywords": ["race condition", "atomic", "exists", "set", "pipeline", "non-atomic"],
        "severity": "critical",
        "issue_type": "concurrency",
    },
    "memory_leak": {
        "lines": [13, 27, 38],
        "description_keywords": ["memory leak", "unbounded", "evict", "capacity", "grow"],
        "severity": "critical",
        "issue_type": "performance",
    },
    "n_plus_one": {
        "lines": [47],
        "description_keywords": ["n+1", "pipeline", "mget", "batch", "loop", "round trip"],
        "severity": "major",
        "issue_type": "performance",
    },
    "wrong_lru_order": {
        "lines": [21, 24],
        "description_keywords": ["lru", "order", "move_to_end", "access order", "insertion order", "OrderedDict"],
        "severity": "major",
        "issue_type": "logic",
    },
    "thread_safety": {
        "lines": [13],
        "description_keywords": ["thread", "lock", "concurrent", "race", "mutex", "atomic"],
        "severity": "critical",
        "issue_type": "concurrency",
    },
    "pickle_injection": {
        "lines": [26],
        "description_keywords": ["pickle", "deseri", "arbitrary code", "injection", "untrusted", "json"],
        "severity": "critical",
        "issue_type": "security",
    },
}
|
| 207 |
+
|
| 208 |
+
# Synthetic pull-request payload presented to the reviewing agent.  The code
# under review is BUGGY_CODE (defined above in this module); the test/linter
# summaries report no problems, so the agent must find the seeded defects by
# reading the code rather than relying on tooling output.
PULL_REQUEST: Dict[str, Any] = {
    "pull_request_title": "Introduce DistributedLRUCache with Redis backing for session store",
    "author": "senior-eng",
    "description": (
        "Implements a two-tier LRU cache (local + Redis) to reduce DB load by 60%. "
        "Designed for high-throughput production use. Please review thoroughly."
    ),
    "files_changed": [
        {
            "filename": "cache.py",
            "language": "python",
            "content": BUGGY_CODE,
            # Number of lines in the reviewed file (newline count + 1).
            "line_count": BUGGY_CODE.count("\n") + 1,
        }
    ],
    "test_results": "Unit tests pass. Load tests not yet run.",
    "linter_output": "No issues found by flake8.",
}
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def get_task_config() -> Dict[str, Any]:
    """Assemble and return the full configuration bundle for this hard task.

    Bundles the task identity, step budget, the PR under review, the
    ground-truth issue registry, and the reference fixed implementation.
    """
    # dict(...) keyword form keeps the same insertion order as the original
    # literal, so any consumer iterating the mapping sees identical keys.
    return dict(
        task_id=TASK_ID,
        max_steps=MAX_STEPS,
        pull_request=PULL_REQUEST,
        known_issues=KNOWN_ISSUES,
        fixed_code=FIXED_CODE,
        difficulty="hard",
        description=(
            "Review a production-grade distributed LRU cache implementation. "
            "Identify all concurrency, performance, correctness, and security issues. "
            "Provide a fully corrected implementation."
        ),
    )
|
tests/test_env.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test suite: validates OpenEnv compliance and grader correctness.
|
| 3 |
+
Run with: python tests/test_env.py
|
| 4 |
+
"""
|
| 5 |
+
import sys, os
|
| 6 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 7 |
+
|
| 8 |
+
from env import CodeReviewEnv, TASK_IDS
|
| 9 |
+
from models import ReviewAction, Observation, StepReward, EnvironmentState
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_reset_returns_observation():
|
| 13 |
+
for task_id in TASK_IDS:
|
| 14 |
+
env = CodeReviewEnv()
|
| 15 |
+
obs = env.reset(task_id)
|
| 16 |
+
assert isinstance(obs, Observation), f"reset() must return Observation for {task_id}"
|
| 17 |
+
assert obs.step == 0
|
| 18 |
+
assert obs.task_id == task_id
|
| 19 |
+
assert len(obs.review_context.files_changed) > 0
|
| 20 |
+
print("β reset() returns valid Observation for all tasks")
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def test_state_returns_environment_state():
    """state() must expose an EnvironmentState snapshot right after reset."""
    environment = CodeReviewEnv()
    environment.reset(TASK_IDS[0])
    snapshot = environment.state()
    assert isinstance(snapshot, EnvironmentState)
    assert snapshot.step == 0
    print("β state() returns EnvironmentState")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def test_step_returns_tuple():
    """step() must honour the (obs, reward, done, info) 4-tuple contract."""
    environment = CodeReviewEnv()
    environment.reset(TASK_IDS[0])
    review = ReviewAction(
        action_type="review",
        severity="critical",
        issue_type="bug",
        line_number=3,
        description="test description",
    )
    observation, step_reward, finished, extra = environment.step(review)
    assert isinstance(observation, Observation)
    assert isinstance(step_reward, StepReward)
    assert isinstance(finished, bool)
    assert isinstance(extra, dict)
    print("β step() returns (Observation, StepReward, bool, dict)")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def test_reward_range():
    """Intermediate step rewards must stay inside the [-1.0, 1.0] band."""
    environment = CodeReviewEnv()
    environment.reset(TASK_IDS[0])
    for _ in range(3):
        # Build a fresh action each step, matching how an agent would act.
        probe = ReviewAction(
            action_type="review",
            severity="minor",
            issue_type="style",
            description="some issue",
        )
        _, reward, finished, _ = environment.step(probe)
        assert -1.0 <= reward.value <= 1.0, f"Reward {reward.value} out of range"
        if finished:
            break
    print("β All intermediate rewards in [-1.0, 1.0]")
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def test_done_on_submit():
    """Submitting a verdict must end the episode and yield a bounded score."""
    environment = CodeReviewEnv()
    environment.reset(TASK_IDS[0])
    submit = ReviewAction(action_type="submit", verdict="request_changes", confidence=0.5)
    _, _, finished, extra = environment.step(submit)
    assert finished is True
    assert "final_score" in extra
    assert 0.0 <= extra["final_score"] <= 1.0
    print("β Episode terminates on submit with final_score in [0.0, 1.0]")
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def test_done_on_max_steps():
    """The episode must self-terminate once the step budget is exhausted."""
    environment = CodeReviewEnv()
    environment.reset(TASK_IDS[0])
    limit = environment.state().max_steps
    finished = False
    # A few extra iterations guard against an off-by-one in the env itself.
    for _ in range(limit + 5):
        _, _, finished, _ = environment.step(
            ReviewAction(action_type="comment", comment="still reviewing")
        )
        if finished:
            break
    assert finished is True, "Episode should terminate at max_steps"
    print("β Episode terminates at max_steps")
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def test_perfect_score_task1():
    """An ideal review of task 1 — all bugs flagged, patched, verdict — scores 1.0."""
    environment = CodeReviewEnv()
    environment.reset("task_1_easy_bug_hunt")
    ideal_review = [
        ReviewAction(action_type="review", severity="critical", issue_type="bug",
                     line_number=3, description="assignment operator = instead of == comparison operator"),
        ReviewAction(action_type="review", severity="critical", issue_type="bug",
                     line_number=6, description="off-by-one: range should be len(numbers) not len+1 IndexError"),
        ReviewAction(action_type="review", severity="major", issue_type="bug",
                     line_number=9, description="missing return statement returns None"),
        ReviewAction(action_type="patch",
                     patched_code="def find_max(numbers):\n    if len(numbers) == 0:\n        raise ValueError()\n    max_val = numbers[0]\n    for i in range(1, len(numbers)):\n        if numbers[i] > max_val:\n            max_val = numbers[i]\n    return max_val"),
        ReviewAction(action_type="submit", verdict="request_changes", confidence=0.99),
    ]
    finished = False
    for step_action in ideal_review:
        if finished:
            break
        _, _, finished, info = environment.step(step_action)
    assert info["final_score"] == 1.0, f"Expected 1.0, got {info['final_score']}"
    print("β Task 1 perfect score achievable")
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def test_zero_score_no_actions():
    """Approving without reviewing anything should score essentially zero."""
    environment = CodeReviewEnv()
    environment.reset("task_2_medium_security")
    blind_approve = ReviewAction(action_type="submit", verdict="approve", confidence=0.1)
    _, _, _, info = environment.step(blind_approve)
    assert info["final_score"] < 0.1, f"Blind approve should score near 0, got {info['final_score']}"
    print("β Blind approve scores near 0")
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def test_repetition_penalty():
    """Submitting the exact same finding twice must incur a repetition penalty."""
    environment = CodeReviewEnv()
    environment.reset(TASK_IDS[0])
    duplicate = ReviewAction(
        action_type="review",
        severity="minor",
        issue_type="style",
        description="identical description here",
    )
    environment.step(duplicate)
    _, second_reward, _, _ = environment.step(duplicate)
    assert second_reward.breakdown.get("repetition_penalty", 0) < 0, "Repetition should be penalised"
    print("β Repetition penalty applied for identical descriptions")
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def test_state_immutability():
    """state() should return a copy, not a live reference."""
    environment = CodeReviewEnv()
    environment.reset(TASK_IDS[0])
    before = environment.state()
    environment.step(ReviewAction(action_type="comment", comment="hi"))
    after = environment.state()
    # A live reference would show the post-step counter in both snapshots.
    assert before.step != after.step, "state() must return a snapshot copy"
    print("β state() returns immutable snapshot")
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
if __name__ == "__main__":
|
| 142 |
+
tests = [
|
| 143 |
+
test_reset_returns_observation,
|
| 144 |
+
test_state_returns_environment_state,
|
| 145 |
+
test_step_returns_tuple,
|
| 146 |
+
test_reward_range,
|
| 147 |
+
test_done_on_submit,
|
| 148 |
+
test_done_on_max_steps,
|
| 149 |
+
test_perfect_score_task1,
|
| 150 |
+
test_zero_score_no_actions,
|
| 151 |
+
test_repetition_penalty,
|
| 152 |
+
test_state_immutability,
|
| 153 |
+
]
|
| 154 |
+
passed = 0
|
| 155 |
+
for t in tests:
|
| 156 |
+
try:
|
| 157 |
+
t()
|
| 158 |
+
passed += 1
|
| 159 |
+
except Exception as e:
|
| 160 |
+
print(f"β {t.__name__}: {e}")
|
| 161 |
+
print(f"\n{passed}/{len(tests)} tests passed")
|
| 162 |
+
sys.exit(0 if passed == len(tests) else 1)
|