Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- 02-deployment.md +427 -0
- Dockerfile +18 -0
- GRADING.md +253 -0
- README.md +93 -5
- __init__.py +11 -0
- client.py +34 -0
- dataset/build_dataset.py +461 -0
- dataset/category_similarity.json +17 -0
- dataset/fixtures/toy_project/src/math_utils.py +6 -0
- dataset/fixtures/toy_project/tests/test_flaky.py +7 -0
- env/__init__.py +4 -0
- env/environment.py +216 -0
- env/models.py +42 -0
- env/sandbox.py +241 -0
- env/task_loader.py +69 -0
- flakysleuth_build_plan.md +1236 -0
- graders/__init__.py +17 -0
- graders/task1_grader.py +16 -0
- graders/task2_grader.py +59 -0
- graders/task3_grader.py +161 -0
- inference.py +298 -0
- inference_compliance.py +188 -0
- inference_debug.py +606 -0
- models.py +3 -0
- openenv.yaml +37 -0
- pyproject.toml +34 -0
- requirements.txt +10 -0
- server.py +8 -0
- server/__init__.py +3 -0
- server/app.py +102 -0
- tests/test_compliance.py +18 -0
- uv.lock +0 -0
02-deployment.md
ADDED
|
@@ -0,0 +1,427 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 2. Deploying an OpenEnv environment
|
| 2 |
+
|
| 3 |
+
This section covers deploying OpenEnv environments locally, on clusters, and on Hugging Face Spaces.
|
| 4 |
+
|
| 5 |
+
**Contents:**
|
| 6 |
+
- [Local Development with Uvicorn](#local-development-with-uvicorn)
|
| 7 |
+
- [Docker Deployment](#docker-deployment)
|
| 8 |
+
- [Hugging Face Spaces](#hugging-face-spaces)
|
| 9 |
+
- [Best Practices](#best-practices)
|
| 10 |
+
|
| 11 |
+
## HF Spaces are the infrastructure for OpenEnv environments
|
| 12 |
+
|
| 13 |
+
Every HF Space provides three things that OpenEnv environments need:
|
| 14 |
+
|
| 15 |
+
| Component | What it provides | How to access | Used as |
|
| 16 |
+
|-----------|------------------|---------------|-----------|
|
| 17 |
+
| **Server** | Running environment endpoint | `https://<username>-<space-name>.hf.space` | Agent and Public API |
|
| 18 |
+
| **Repository** | Installable Python package | `pip install git+https://huggingface.co/spaces/<username>/<space-name>` | Code and client |
|
| 19 |
+
| **Registry** | Docker container image | `docker pull registry.hf.space/<username>-<space-name>:latest` | Deployment |
|
| 20 |
+
|
| 21 |
+
This means a single Space deployment gives you all the components you need to use an environment in training.
|
| 22 |
+
|
| 23 |
+
### 1. Server: A running environment endpoint
|
| 24 |
+
|
| 25 |
+
When you deploy to HF Spaces, your environment runs as a server. The client connects via **WebSocket** (`/ws`) for a persistent session:
|
| 26 |
+
|
| 27 |
+
```python
|
| 28 |
+
from echo_env import EchoEnv, EchoAction
|
| 29 |
+
|
| 30 |
+
# Connect directly to the running Space (WebSocket under the hood)
|
| 31 |
+
# Async (recommended):
|
| 32 |
+
async with EchoEnv(base_url="https://openenv-echo-env.hf.space") as client:
|
| 33 |
+
result = await client.reset()
|
| 34 |
+
result = await client.step(EchoAction(message="Hello"))
|
| 35 |
+
|
| 36 |
+
# Sync (using .sync() wrapper):
|
| 37 |
+
with EchoEnv(base_url="https://openenv-echo-env.hf.space").sync() as client:
|
| 38 |
+
result = client.reset()
|
| 39 |
+
result = client.step(EchoAction(message="Hello"))
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
**Endpoints available:**
|
| 43 |
+
|
| 44 |
+
| Endpoint | Protocol | Description |
|
| 45 |
+
|----------|----------|-------------|
|
| 46 |
+
| `/ws` | **WebSocket** | Persistent session (used by client) |
|
| 47 |
+
| `/health` | HTTP GET | Health check |
|
| 48 |
+
| `/reset` | HTTP POST | Reset environment (stateless) |
|
| 49 |
+
| `/step` | HTTP POST | Execute action (stateless) |
|
| 50 |
+
| `/state` | HTTP GET | Get current state |
|
| 51 |
+
| `/docs` | HTTP GET | OpenAPI documentation |
|
| 52 |
+
| `/web` | HTTP GET | Interactive web UI |
|
| 53 |
+
|
| 54 |
+
> **Note:** The Python client uses the `/ws` WebSocket endpoint by default. HTTP endpoints are available for debugging or stateless use cases.
|
| 55 |
+
|
| 56 |
+
**Example: Check if a Space is running**
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
curl https://openenv-echo-env.hf.space/health
|
| 60 |
+
# {"status": "healthy"}
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
### 2. Repository: Installable Python package
|
| 64 |
+
|
| 65 |
+
Every Space is a Git repository. OpenEnv environments include a `pyproject.toml`, making them pip-installable directly from the Space URL.
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
# Install client package from Space
|
| 69 |
+
pip install git+https://huggingface.co/spaces/openenv/echo-env
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
This installs:
|
| 73 |
+
- **Client class** (`EchoEnv`) — Handles HTTP/WebSocket communication
|
| 74 |
+
- **Models** (`EchoAction`, `EchoObservation`) — Typed action and observation classes
|
| 75 |
+
- **Utilities** — Any helper functions the environment provides
|
| 76 |
+
|
| 77 |
+
**After installation:**
|
| 78 |
+
|
| 79 |
+
```python
|
| 80 |
+
from echo_env import EchoEnv, EchoAction, EchoObservation
|
| 81 |
+
|
| 82 |
+
# Now you have typed classes for the environment
|
| 83 |
+
action = EchoAction(message="Hello")
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
### 3. Registry: Docker container image
|
| 87 |
+
|
| 88 |
+
Every Docker-based Space has a container registry. You can pull and run the environment locally.
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
# Pull the image
|
| 92 |
+
docker pull registry.hf.space/openenv-echo-env:latest
|
| 93 |
+
|
| 94 |
+
# Run locally on port 8001
|
| 95 |
+
docker run -d -p 8001:8000 registry.hf.space/openenv-echo-env:latest
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
**Find the registry URL for any Space:**
|
| 99 |
+
|
| 100 |
+
1. Go to the Space page (e.g., [openenv/echo-env](https://huggingface.co/spaces/openenv/echo-env))
|
| 101 |
+
2. Click **⋮** (three dots) → **"Run locally"**
|
| 102 |
+
3. Copy the `docker run` command
|
| 103 |
+
|
| 104 |
+
### Choosing an access method
|
| 105 |
+
|
| 106 |
+
| Method | Use when | Pros | Cons |
|
| 107 |
+
|--------|----------|------|------|
|
| 108 |
+
| **Server** | Quick testing, low volume | Zero setup | Network latency, rate limits |
|
| 109 |
+
| **Repository** | Need typed classes | Type safety, IDE support | Still need a server |
|
| 110 |
+
| **Docker** | Local dev, high throughput | Full control, no network | Requires Docker |
|
| 111 |
+
|
| 112 |
+
**Typical workflow:**
|
| 113 |
+
|
| 114 |
+
```python
|
| 115 |
+
import asyncio
|
| 116 |
+
from echo_env import EchoEnv, EchoAction
|
| 117 |
+
|
| 118 |
+
async def main():
|
| 119 |
+
# Development: connect to remote Space
|
| 120 |
+
async with EchoEnv(base_url="https://openenv-echo-env.hf.space") as client:
|
| 121 |
+
result = await client.reset()
|
| 122 |
+
|
| 123 |
+
# Production: run locally for speed
|
| 124 |
+
# docker run -d -p 8001:8000 registry.hf.space/openenv-echo-env:latest
|
| 125 |
+
async with EchoEnv(base_url="http://localhost:8001") as client:
|
| 126 |
+
result = await client.reset()
|
| 127 |
+
|
| 128 |
+
# Or let the client manage Docker for you
|
| 129 |
+
client = await EchoEnv.from_env("openenv/echo-env") # Auto-pulls and runs
|
| 130 |
+
async with client:
|
| 131 |
+
result = await client.reset()
|
| 132 |
+
|
| 133 |
+
asyncio.run(main())
|
| 134 |
+
|
| 135 |
+
# For sync usage, use the .sync() wrapper:
|
| 136 |
+
with EchoEnv(base_url="http://localhost:8001").sync() as client:
|
| 137 |
+
result = client.reset()
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
> **Reference:** [HF Spaces Documentation](https://huggingface.co/docs/hub/spaces) | [Environment Hub Collection](https://huggingface.co/collections/openenv/environment-hub)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
## Local Development with Uvicorn
|
| 144 |
+
|
| 145 |
+
The fastest way to iterate on environment logic is running directly with Uvicorn.
|
| 146 |
+
|
| 147 |
+
### Clone and run the environment locally
|
| 148 |
+
|
| 149 |
+
```bash
|
| 150 |
+
# Clone from HF Space
|
| 151 |
+
git clone https://huggingface.co/spaces/burtenshaw/openenv-benchmark
|
| 152 |
+
cd openenv-benchmark
|
| 153 |
+
|
| 154 |
+
# Install in editable mode
|
| 155 |
+
uv sync
|
| 156 |
+
|
| 157 |
+
# Start server
|
| 158 |
+
uv run server
|
| 159 |
+
|
| 160 |
+
# Run isolated from remote Space
|
| 161 |
+
uv run --isolated --project https://huggingface.co/spaces/burtenshaw/openenv-benchmark server
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
### Run Uvicorn directly
|
| 165 |
+
|
| 166 |
+
```bash
|
| 167 |
+
# Full control over uvicorn options
|
| 168 |
+
uvicorn benchmark.server.app:app --host "$HOST" --port "$PORT" --workers "$WORKERS"
|
| 169 |
+
|
| 170 |
+
# With reload for development
|
| 171 |
+
uvicorn benchmark.server.app:app --host 0.0.0.0 --port 8000 --reload
|
| 172 |
+
|
| 173 |
+
# Multi-worker mode for better concurrency:
|
| 174 |
+
uvicorn benchmark.server.app:app --host 0.0.0.0 --port 8000 --workers 4
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
| Flag | Purpose |
|
| 178 |
+
|------|---------|
|
| 179 |
+
| `--reload` | Auto-restart on code changes |
|
| 180 |
+
| `--workers N` | Run N worker processes |
|
| 181 |
+
| `--log-level debug` | Verbose logging |
|
| 182 |
+
|
| 183 |
+
## Docker Deployment
|
| 184 |
+
|
| 185 |
+
Docker provides isolation and reproducibility for production use.
|
| 186 |
+
|
| 187 |
+
### Run the environment locally from the space
|
| 188 |
+
|
| 189 |
+
```bash
|
| 190 |
+
# Run the environment locally from the space
|
| 191 |
+
docker run -d -p 8000:8000 registry.hf.space/openenv-echo-env:latest
|
| 192 |
+
```
|
| 193 |
+
|
| 194 |
+
### Build Image
|
| 195 |
+
|
| 196 |
+
```bash
|
| 197 |
+
# Clone from HF Space
|
| 198 |
+
git clone https://huggingface.co/spaces/burtenshaw/openenv-benchmark
|
| 199 |
+
cd openenv-benchmark
|
| 200 |
+
|
| 201 |
+
# Using OpenEnv CLI (recommended)
|
| 202 |
+
openenv build -t openenv-benchmark:latest
|
| 203 |
+
|
| 204 |
+
# Or with Docker directly
|
| 205 |
+
docker build -t openenv-benchmark:latest -f server/Dockerfile .
|
| 206 |
+
```
|
| 207 |
+
|
| 208 |
+
### Run Container
|
| 209 |
+
|
| 210 |
+
```bash
|
| 211 |
+
# Basic run
|
| 212 |
+
docker run -d -p 8000:8000 my-env:latest
|
| 213 |
+
|
| 214 |
+
# With environment variables
|
| 215 |
+
docker run -d -p 8000:8000 \
|
| 216 |
+
-e WORKERS=4 \
|
| 217 |
+
-e MAX_CONCURRENT_ENVS=100 \
|
| 218 |
+
my-env:latest
|
| 219 |
+
|
| 220 |
+
# Named container for easy management
|
| 221 |
+
docker run -d --name my-env -p 8000:8000 my-env:latest
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
### Connect from Python
|
| 225 |
+
|
| 226 |
+
```python
|
| 227 |
+
import asyncio
|
| 228 |
+
from echo_env import EchoEnv, EchoAction
|
| 229 |
+
|
| 230 |
+
async def main():
|
| 231 |
+
# Async usage (recommended)
|
| 232 |
+
async with EchoEnv(base_url="http://localhost:8000") as client:
|
| 233 |
+
result = await client.reset()
|
| 234 |
+
result = await client.step(EchoAction(message="Hello"))
|
| 235 |
+
print(result.observation)
|
| 236 |
+
|
| 237 |
+
# From Docker image
|
| 238 |
+
client = await EchoEnv.from_docker_image("<local_docker_image>")
|
| 239 |
+
async with client:
|
| 240 |
+
result = await client.reset()
|
| 241 |
+
print(result.observation)
|
| 242 |
+
|
| 243 |
+
asyncio.run(main())
|
| 244 |
+
|
| 245 |
+
# Sync usage (using .sync() wrapper)
|
| 246 |
+
with EchoEnv(base_url="http://localhost:8000").sync() as client:
|
| 247 |
+
result = client.reset()
|
| 248 |
+
result = client.step(EchoAction(message="Hello"))
|
| 249 |
+
print(result.observation)
|
| 250 |
+
```
|
| 251 |
+
|
| 252 |
+
### Container Lifecycle
|
| 253 |
+
|
| 254 |
+
| Method | Container | WebSocket | On `close()` |
|
| 255 |
+
|--------|-----------|-----------|--------------|
|
| 256 |
+
| `from_hub(repo_id)` | Starts | Connects | Stops container |
|
| 257 |
+
| `from_hub(repo_id, use_docker=False)` | None (UV) | Connects | Stops UV server |
|
| 258 |
+
| `from_docker_image(image)` | Starts | Connects | Stops container |
|
| 259 |
+
| `MyEnv(base_url=...)` | None | Connects | Disconnects only |
|
| 260 |
+
|
| 261 |
+
### Find Docker commands for any Space
|
| 262 |
+
|
| 263 |
+
1. Open the Space on HuggingFace Hub
|
| 264 |
+
2. Click **⋮ (three dots)** menu
|
| 265 |
+
3. Select **"Run locally"**
|
| 266 |
+
4. Copy the provided `docker run` command
|
| 267 |
+
|
| 268 |
+
## Deploy with CLI
|
| 269 |
+
|
| 270 |
+
```bash
|
| 271 |
+
cd my_env
|
| 272 |
+
|
| 273 |
+
# Deploy to your namespace
|
| 274 |
+
openenv push
|
| 275 |
+
|
| 276 |
+
# Deploy to specific repo
|
| 277 |
+
openenv push --repo-id username/my-env
|
| 278 |
+
|
| 279 |
+
# Deploy as private
|
| 280 |
+
openenv push --repo-id username/my-env --private
|
| 281 |
+
```
|
| 282 |
+
|
| 283 |
+
### Space Configuration
|
| 284 |
+
|
| 285 |
+
The `openenv.yaml` manifest controls Space settings:
|
| 286 |
+
|
| 287 |
+
```yaml
|
| 288 |
+
# openenv.yaml
|
| 289 |
+
name: my_env
|
| 290 |
+
version: "1.0.0"
|
| 291 |
+
description: My custom environment
|
| 292 |
+
```
|
| 293 |
+
|
| 294 |
+
### Hardware Options
|
| 295 |
+
|
| 296 |
+
| Tier | vCPU | RAM | Cost |
|
| 297 |
+
|------|------|-----|------|
|
| 298 |
+
| CPU Basic (Free) | 2 | 16GB | Free |
|
| 299 |
+
| CPU Upgrade | 8 | 32GB | $0.03/hr |
|
| 300 |
+
|
| 301 |
+
OpenEnv environments support configuration via environment variables.
|
| 302 |
+
|
| 303 |
+
| Variable | Default | Description |
|
| 304 |
+
|----------|---------|-------------|
|
| 305 |
+
| `WORKERS` | 4 | Uvicorn worker processes |
|
| 306 |
+
| `PORT` | 8000 | Server port |
|
| 307 |
+
| `HOST` | 0.0.0.0 | Bind address |
|
| 308 |
+
| `MAX_CONCURRENT_ENVS` | 100 | Max WebSocket sessions |
|
| 309 |
+
| `ENABLE_WEB_INTERFACE` | Auto | Enable web UI |
|
| 310 |
+
|
| 311 |
+
### Environment-Specific Variables
|
| 312 |
+
|
| 313 |
+
Some environments have custom variables:
|
| 314 |
+
|
| 315 |
+
**TextArena:**
|
| 316 |
+
```bash
|
| 317 |
+
TEXTARENA_ENV_ID=Wordle-v0
|
| 318 |
+
TEXTARENA_NUM_PLAYERS=1
|
| 319 |
+
TEXTARENA_MAX_TURNS=6
|
| 320 |
+
```
|
| 321 |
+
|
| 322 |
+
**Coding Environment:**
|
| 323 |
+
```bash
|
| 324 |
+
SANDBOX_TIMEOUT=30
|
| 325 |
+
MAX_OUTPUT_LENGTH=10000
|
| 326 |
+
```
|
| 327 |
+
|
| 328 |
+
# DEMO: Deploying to Hugging Face Spaces
|
| 329 |
+
|
| 330 |
+
This demo walks through the full workflow: create an environment, test locally, deploy to HF Spaces, and use it.
|
| 331 |
+
|
| 332 |
+
## Step 1: Initialize a new environment
|
| 333 |
+
|
| 334 |
+
```bash
|
| 335 |
+
openenv init my_env
|
| 336 |
+
cd my_env
|
| 337 |
+
```
|
| 338 |
+
|
| 339 |
+
This creates the standard OpenEnv structure:
|
| 340 |
+
|
| 341 |
+
```
|
| 342 |
+
my_env/
|
| 343 |
+
├── server/
|
| 344 |
+
│ ├── app.py # FastAPI server
|
| 345 |
+
│ ├── environment.py # Your environment logic
|
| 346 |
+
│ └── Dockerfile
|
| 347 |
+
├── models.py # Action/Observation types
|
| 348 |
+
├── client.py # HTTP client
|
| 349 |
+
├── openenv.yaml # Manifest
|
| 350 |
+
└── pyproject.toml
|
| 351 |
+
```
|
| 352 |
+
|
| 353 |
+
## Step 2: Run locally
|
| 354 |
+
|
| 355 |
+
```bash
|
| 356 |
+
# Start the server
|
| 357 |
+
uv run server
|
| 358 |
+
|
| 359 |
+
# Or with uvicorn directly
|
| 360 |
+
uvicorn server.app:app --host 0.0.0.0 --port 8000 --reload
|
| 361 |
+
```
|
| 362 |
+
|
| 363 |
+
Test the health endpoint:
|
| 364 |
+
|
| 365 |
+
```bash
|
| 366 |
+
curl http://localhost:8000/health
|
| 367 |
+
# {"status": "healthy"}
|
| 368 |
+
```
|
| 369 |
+
|
| 370 |
+
## Step 3: Deploy to HF Spaces
|
| 371 |
+
|
| 372 |
+
```bash
|
| 373 |
+
openenv push --repo-id username/my-env
|
| 374 |
+
```
|
| 375 |
+
|
| 376 |
+
Your environment is now live at:
|
| 377 |
+
- Web UI: https://username-my-env.hf.space/web
|
| 378 |
+
- API Docs: https://username-my-env.hf.space/docs
|
| 379 |
+
- Health: https://username-my-env.hf.space/health
|
| 380 |
+
|
| 381 |
+
```bash
|
| 382 |
+
curl https://openenv-echo-env.hf.space/health
|
| 383 |
+
# {"status": "healthy"}
|
| 384 |
+
```
|
| 385 |
+
|
| 386 |
+
## Step 4: Install the environment
|
| 387 |
+
|
| 388 |
+
```bash
|
| 389 |
+
uv pip install git+https://huggingface.co/spaces/openenv/echo_env
|
| 390 |
+
```
|
| 391 |
+
|
| 392 |
+
## Step 5: Run locally via Docker (optional)
|
| 393 |
+
|
| 394 |
+
Pull and run the container from the HF registry, or open the [browser](https://huggingface.co/spaces/openenv/echo_env?docker=true):
|
| 395 |
+
|
| 396 |
+
```bash
|
| 397 |
+
# Pull from HF Spaces registry
|
| 398 |
+
docker pull registry.hf.space/openenv-echo-env:latest
|
| 399 |
+
|
| 400 |
+
# Run locally
|
| 401 |
+
docker run -it -p 7860:7860 --platform=linux/amd64 \
|
| 402 |
+
registry.hf.space/openenv-echo-env:latest
|
| 403 |
+
```
|
| 404 |
+
|
| 405 |
+
Now connect to your local instance:
|
| 406 |
+
|
| 407 |
+
```python
|
| 408 |
+
import asyncio
|
| 409 |
+
from echo_env import EchoEnv, EchoAction
|
| 410 |
+
|
| 411 |
+
# Async (recommended)
|
| 412 |
+
async def main():
|
| 413 |
+
async with EchoEnv(base_url="http://localhost:8000") as env:
|
| 414 |
+
result = await env.reset()
|
| 415 |
+
print(result.observation)
|
| 416 |
+
result = await env.step(EchoAction(message="Hello"))
|
| 417 |
+
print(result.observation)
|
| 418 |
+
|
| 419 |
+
asyncio.run(main())
|
| 420 |
+
|
| 421 |
+
# Sync (using .sync() wrapper)
|
| 422 |
+
with EchoEnv(base_url="http://localhost:8000").sync() as env:
|
| 423 |
+
result = env.reset()
|
| 424 |
+
print(result.observation)
|
| 425 |
+
result = env.step(EchoAction(message="Hello"))
|
| 426 |
+
print(result.observation)
|
| 427 |
+
```
|
Dockerfile
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 4 |
+
git \
|
| 5 |
+
patch \
|
| 6 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 7 |
+
|
| 8 |
+
WORKDIR /app
|
| 9 |
+
|
| 10 |
+
COPY requirements.txt .
|
| 11 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 12 |
+
|
| 13 |
+
COPY . .
|
| 14 |
+
|
| 15 |
+
EXPOSE 8000
|
| 16 |
+
|
| 17 |
+
ENV ENABLE_WEB_INTERFACE=true
|
| 18 |
+
CMD ["python", "-m", "server.app"]
|
GRADING.md
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FlakySleuth Grading: Exact Scoring Formulas
|
| 2 |
+
|
| 3 |
+
This document describes the **exact scoring logic implemented in code** for:
|
| 4 |
+
- Task 1: `classify` (`classify_flakiness`)
|
| 5 |
+
- Task 2: `root_cause` (`classify_root_cause`)
|
| 6 |
+
- Task 3: `fix_proposal` (`propose_fix`)
|
| 7 |
+
|
| 8 |
+
It also explains how per-step rewards are combined inside the environment.
|
| 9 |
+
|
| 10 |
+
## Source of Truth
|
| 11 |
+
|
| 12 |
+
- `env/environment.py`
|
| 13 |
+
- `graders/__init__.py`
|
| 14 |
+
- `graders/task1_grader.py`
|
| 15 |
+
- `graders/task2_grader.py`
|
| 16 |
+
- `graders/task3_grader.py`
|
| 17 |
+
- `dataset/category_similarity.json`
|
| 18 |
+
|
| 19 |
+
## 1) Dispatch: Which grader is used?
|
| 20 |
+
|
| 21 |
+
`graders/grade_action()` selects grader by `task["task_type"]`:
|
| 22 |
+
- `classify` -> Task 1 grader
|
| 23 |
+
- `root_cause` -> Task 2 grader
|
| 24 |
+
- `fix_proposal` -> Task 3 grader
|
| 25 |
+
- anything else -> `0.0`
|
| 26 |
+
|
| 27 |
+
## 2) Environment reward pipeline (applies to all tasks)
|
| 28 |
+
|
| 29 |
+
At each `env.step(action)`:
|
| 30 |
+
|
| 31 |
+
1. If action is terminal (`classify_flakiness`, `classify_root_cause`, `propose_fix`):
|
| 32 |
+
- compute `terminal_score = grade_action(action, task)`
|
| 33 |
+
- compute penalties
|
| 34 |
+
- final step reward:
|
| 35 |
+
|
| 36 |
+
```text
|
| 37 |
+
reward = clamp(
|
| 38 |
+
cumulative_progress + terminal_score - late_penalty - wrong_dir_penalty,
|
| 39 |
+
0.0,
|
| 40 |
+
1.0
|
| 41 |
+
)
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
Where:
|
| 45 |
+
- `late_penalty = max(0, step_count - 15) * 0.05`
|
| 46 |
+
- `wrong_dir_penalty = 0.2` only when:
|
| 47 |
+
- action is `classify_flakiness`
|
| 48 |
+
- predicted argument is `"stable"`
|
| 49 |
+
- ground-truth label is `"flaky"`
|
| 50 |
+
- `done = True`
|
| 51 |
+
|
| 52 |
+
2. If action is non-terminal (exploration):
|
| 53 |
+
- compute `progress` from exploration action
|
| 54 |
+
- update cumulative progress:
|
| 55 |
+
|
| 56 |
+
```text
|
| 57 |
+
cumulative_progress = clamp(cumulative_progress + progress, 0.0, 0.30)
|
| 58 |
+
reward = progress
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
3. Timeout rule:
|
| 62 |
+
- if not already done and `step_count >= max_steps`, set `done = True`
|
| 63 |
+
- no additional terminal score is applied at timeout.
|
| 64 |
+
|
| 65 |
+
## 3) Exploration progress rewards (exact values)
|
| 66 |
+
|
| 67 |
+
### `read_file`
|
| 68 |
+
- file missing/unsafe -> `progress = -0.05`
|
| 69 |
+
- file already read in this episode -> `progress = 0.0`
|
| 70 |
+
- new file:
|
| 71 |
+
- if file path contains `task["test_file"]` -> `0.07`
|
| 72 |
+
- else if file ends with `.py` -> `0.03`
|
| 73 |
+
- else -> `0.01`
|
| 74 |
+
|
| 75 |
+
### `search_code`
|
| 76 |
+
- if query contains any flaky-signal tokens (`sleep`, `random`, `time`, `datetime`, `thread`, `asyncio`, `fixture`, `setup`, `teardown`, `global`, `shared`, `singleton`, `os.environ`, `socket`, `timeout`, `retry`, `mock`, `patch`) -> `0.04`
|
| 77 |
+
- otherwise -> `0.01`
|
| 78 |
+
|
| 79 |
+
### `run_test`
|
| 80 |
+
- if category is **not** one of `OD`, `OD-Brit`, `OD-Vic` -> `0.05`
|
| 81 |
+
- if category is order-dependent (`OD`, `OD-Brit`, `OD-Vic`) -> `0.0`
|
| 82 |
+
|
| 83 |
+
### unsupported action type
|
| 84 |
+
- `progress = -0.05`
|
| 85 |
+
|
| 86 |
+
## 4) Task 1 scorer (`classify_flakiness`)
|
| 87 |
+
|
| 88 |
+
Binary exact-match scorer:
|
| 89 |
+
|
| 90 |
+
```text
|
| 91 |
+
if action_type != "classify_flakiness": return 0.0
|
| 92 |
+
if predicted not in {"flaky","stable"}: return 0.0
|
| 93 |
+
truth = task["label"] (default "flaky")
|
| 94 |
+
terminal_score = 1.0 if predicted == truth else 0.0
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
Notes:
|
| 98 |
+
- In current dataset builder, rows are written with `label = "flaky"` by default.
|
| 99 |
+
- Predicting `"stable"` on flaky truth also triggers environment `wrong_dir_penalty = 0.2`.
|
| 100 |
+
|
| 101 |
+
## 5) Task 2 scorer (`classify_root_cause`)
|
| 102 |
+
|
| 103 |
+
Matrix-based similarity scorer.
|
| 104 |
+
|
| 105 |
+
### 5.1 Category normalization
|
| 106 |
+
|
| 107 |
+
Prediction and truth are normalized by:
|
| 108 |
+
- trim
|
| 109 |
+
- replace `_` with `-`
|
| 110 |
+
- replace spaces with `-`
|
| 111 |
+
- uppercase and map through canonical aliases:
|
| 112 |
+
- `OD-BRIT` -> `OD-Brit`
|
| 113 |
+
- `OD-VIC` -> `OD-Vic`
|
| 114 |
+
- etc.
|
| 115 |
+
|
| 116 |
+
If normalized value is not in valid set, score is `0.0`.
|
| 117 |
+
|
| 118 |
+
Truth category is the **first** category if semicolon-separated:
|
| 119 |
+
|
| 120 |
+
```text
|
| 121 |
+
raw_truth = str(task["category"]).split(";")[0]
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
### 5.2 Similarity scoring
|
| 125 |
+
|
| 126 |
+
```text
|
| 127 |
+
if predicted == truth: return 1.0
|
| 128 |
+
else return similarity[predicted,truth] or similarity[truth,predicted] or 0.0
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
The similarity matrix is loaded from `dataset/category_similarity.json`.
|
| 132 |
+
|
| 133 |
+
Current non-identity similarity entries:
|
| 134 |
+
- `OD,OD-Brit`: `0.7`
|
| 135 |
+
- `OD,OD-Vic`: `0.7`
|
| 136 |
+
- `OD-Brit,OD-Vic`: `0.8`
|
| 137 |
+
- `OD,NIO`: `0.4`
|
| 138 |
+
- `OD,NDOI`: `0.3`
|
| 139 |
+
- `NOD,TD`: `0.6`
|
| 140 |
+
- `NOD,TZD`: `0.5`
|
| 141 |
+
- `NOD,NDOI`: `0.5`
|
| 142 |
+
- `TD,TZD`: `0.7`
|
| 143 |
+
- `NOD,ID`: `0.3`
|
| 144 |
+
- `UD,OD`: `0.2`
|
| 145 |
+
- `UD,NOD`: `0.2`
|
| 146 |
+
- `UD,NIO`: `0.2`
|
| 147 |
+
- `UD,TD`: `0.2`
|
| 148 |
+
- `UD,ID`: `0.2`
|
| 149 |
+
|
| 150 |
+
Any missing pair defaults to `0.0`.
|
| 151 |
+
|
| 152 |
+
## 6) Task 3 scorer (`propose_fix`)
|
| 153 |
+
|
| 154 |
+
Hybrid weighted scorer:
|
| 155 |
+
|
| 156 |
+
```text
|
| 157 |
+
if action_type != "propose_fix": return 0.0
|
| 158 |
+
if proposed_fix is empty: return 0.0
|
| 159 |
+
|
| 160 |
+
total = 0.35 * pattern_score + 0.25 * apply_score + 0.40 * judge_score
|
| 161 |
+
terminal_score = round(clamp(total, 0.0, 1.0), 4)
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
### 6.1 `pattern_score`
|
| 165 |
+
|
| 166 |
+
Category-specific keyword patterns are checked against the proposed diff.
|
| 167 |
+
|
| 168 |
+
For category with pattern list:
|
| 169 |
+
|
| 170 |
+
```text
|
| 171 |
+
matches = number of patterns found (case-insensitive substring)
|
| 172 |
+
pattern_score = min(1.0, matches / max(1, len(patterns) * 0.4))
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
If category has no pattern list:
|
| 176 |
+
- `pattern_score = 0.5`
|
| 177 |
+
|
| 178 |
+
Current pattern lists:
|
| 179 |
+
- `TD`: `freeze_time`, `mock`, `patch`, `utcnow`, `datetime`, `monkeypatch`
|
| 180 |
+
- `TZD`: `timezone`, `utc`, `pytz`, `zoneinfo`, `tzinfo`, `UTC`
|
| 181 |
+
- `NOD`: `seed`, `mock`, `patch`, `deterministic`, `sorted`
|
| 182 |
+
- `NIO`: `setup`, `teardown`, `fixture`, `yield`, `cleanup`, `autouse`
|
| 183 |
+
- `ID`: `sorted(`, `list(`, `frozenset`, `OrderedDict`
|
| 184 |
+
|
| 185 |
+
### 6.2 `apply_score` (`_check_diff_applies`)
|
| 186 |
+
|
| 187 |
+
```text
|
| 188 |
+
if diff does not contain both '---' and '+++': return 0.0
|
| 189 |
+
if sandbox_root missing or not existing: return 0.3
|
| 190 |
+
else run: patch --dry-run -p1 -i <temp_patch>
|
| 191 |
+
return 1.0 if patch exit code == 0
|
| 192 |
+
return 0.0 otherwise
|
| 193 |
+
on exception: return 0.3
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
### 6.3 `judge_score` (`_llm_judge`)
|
| 197 |
+
|
| 198 |
+
LLM judge behavior:
|
| 199 |
+
- If no API key available -> `judge_score = 0.5`
|
| 200 |
+
- Else sends a judge prompt asking for JSON `{"score": 0..10, "reason": ...}`
|
| 201 |
+
- Parses integer score, clamps to `[0,10]`, then scales to `[0,1]`:
|
| 202 |
+
|
| 203 |
+
```text
|
| 204 |
+
judge_score = clamp(int_score, 0, 10) / 10
|
| 205 |
+
```
|
| 206 |
+
|
| 207 |
+
- On any judge exception / parse failure -> `judge_score = 0.5`
|
| 208 |
+
|
| 209 |
+
API/model resolution in judge:
|
| 210 |
+
- API key preference: `API_KEY` -> `OPENROUTER_API_KEY` -> `OPENAI_API_KEY`
|
| 211 |
+
- Base URL:
|
| 212 |
+
- OpenRouter inferred -> `https://openrouter.ai/api/v1`
|
| 213 |
+
- else -> `https://api.openai.com/v1`
|
| 214 |
+
- Model default:
|
| 215 |
+
- OpenRouter base URL -> `qwen/qwen3.6-plus:free`
|
| 216 |
+
- else -> `gpt-4o-mini`
|
| 217 |
+
|
| 218 |
+
## 7) Worked examples
|
| 219 |
+
|
| 220 |
+
### Example A: Task 1 correct classify early
|
| 221 |
+
|
| 222 |
+
- `cumulative_progress = 0.05`
|
| 223 |
+
- `terminal_score = 1.0`
|
| 224 |
+
- `late_penalty = 0.0`
|
| 225 |
+
- `wrong_dir_penalty = 0.0`
|
| 226 |
+
|
| 227 |
+
```text
|
| 228 |
+
reward = clamp(0.05 + 1.0 - 0 - 0, 0, 1) = 1.0
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
### Example B: Task 2 wrong category but some exploration
|
| 232 |
+
|
| 233 |
+
- `cumulative_progress = 0.05`
|
| 234 |
+
- `terminal_score = 0.0` (no similarity match)
|
| 235 |
+
- penalties = `0`
|
| 236 |
+
|
| 237 |
+
```text
|
| 238 |
+
reward = clamp(0.05 + 0.0, 0, 1) = 0.05
|
| 239 |
+
```
|
| 240 |
+
|
| 241 |
+
### Example C: Task 3 with weak fix and no API key
|
| 242 |
+
|
| 243 |
+
- `judge_score = 0.5` fallback
|
| 244 |
+
- `apply_score` and `pattern_score` depend on diff contents
|
| 245 |
+
- final weighted sum then clamped and rounded to 4 decimals.
|
| 246 |
+
|
| 247 |
+
## 8) Important implementation notes
|
| 248 |
+
|
| 249 |
+
- `cumulative_progress` is capped at `0.30` and never below `0.0`.
|
| 250 |
+
- Terminal reward can be reduced by late penalty after step 15.
|
| 251 |
+
- Timeout does not invoke grader; it only ends the episode.
|
| 252 |
+
- Dataset construction choices (especially `label` and category quality) heavily influence observed score behavior.
|
| 253 |
+
|
README.md
CHANGED
|
@@ -1,10 +1,98 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: FlakySleuth Environment Server
|
| 3 |
+
emoji: "🔍"
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 8000
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# FlakySleuth Environment
|
| 15 |
+
|
| 16 |
+
OpenEnv-compatible RL environment for flaky-test investigation in real Python repos.
|
| 17 |
+
|
| 18 |
+
## Setup
|
| 19 |
+
|
| 20 |
+
```bash
|
| 21 |
+
python3 -m venv .venv
|
| 22 |
+
source .venv/bin/activate
|
| 23 |
+
pip install -r requirements.txt
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
## Build Dataset
|
| 27 |
+
|
| 28 |
+
Input: raw IDoFT CSV (e.g. `py-data.csv`)
|
| 29 |
+
Output: processed task CSV (`dataset/py_tasks.csv`)
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
python dataset/build_dataset.py --input py-data.csv --output dataset/py_tasks.csv
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
### `dataset/build_dataset.py` flags
|
| 36 |
+
|
| 37 |
+
| Flag | Type | Default | Description |
|
| 38 |
+
|---|---|---|---|
|
| 39 |
+
| `--input` | `str` | `idoft/py-data.csv` | Path to raw IDoFT CSV |
|
| 40 |
+
| `--output` | `str` | `dataset/py_tasks.csv` | Output processed task CSV |
|
| 41 |
+
| `--validate-only` | bool | `False` | Validate schema + print summary only (no clone/fetch) |
|
| 42 |
+
| `--limit` | `int` | `None` | Process first N rows only |
|
| 43 |
+
|
| 44 |
+
Notes:
|
| 45 |
+
- Uses live GitHub fetch at exact SHAs.
|
| 46 |
+
- Optional `GITHUB_TOKEN` improves PR diff fetching/rate limits.
|
| 47 |
+
|
| 48 |
+
## Run Server
|
| 49 |
+
|
| 50 |
+
```bash
|
| 51 |
+
python -m server.app
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
Quick check:
|
| 55 |
+
```bash
|
| 56 |
+
curl -s http://localhost:8000/health
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
## Run Inference
|
| 60 |
+
|
| 61 |
+
Recommended (OpenRouter):
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
export OPENROUTER_API_KEY=your_openrouter_api_key
|
| 65 |
+
export API_BASE_URL=https://openrouter.ai/api/v1
|
| 66 |
+
export MODEL_NAME=qwen/qwen3.6-plus:free
|
| 67 |
+
|
| 68 |
+
python inference.py --dataset-path dataset/py_tasks.csv --episodes-per-task 5
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
### `inference.py` flags
|
| 72 |
+
|
| 73 |
+
| Flag | Type | Default | Description |
|
| 74 |
+
|---|---|---|---|
|
| 75 |
+
| `--dataset-path` | `str` | `dataset/py_tasks.csv` | Processed task CSV used by env |
|
| 76 |
+
| `--episodes-per-task` | `int` | `5` | Episodes per selected task type |
|
| 77 |
+
| `--task-types` | `str` | `classify,root_cause,fix_proposal` | Comma-separated task types |
|
| 78 |
+
| `--no-progress` | bool | `False` | Disable progress bars |
|
| 79 |
+
| `--trace-agent` | bool | `False` | Print model output, action/tool call, and step results |
|
| 80 |
+
| `--trace-prompts` | bool | `False` | Also print prompts sent to the model |
|
| 81 |
+
| `--trace-max-chars` | `int` | `2500` | Max chars per traced block |
|
| 82 |
+
|
| 83 |
+
Trace to log:
|
| 84 |
+
```bash
|
| 85 |
+
python inference.py \
|
| 86 |
+
--dataset-path dataset/py_tasks.csv \
|
| 87 |
+
--episodes-per-task 5 \
|
| 88 |
+
--task-types classify,root_cause \
|
| 89 |
+
--trace-agent --trace-prompts > agent_trace.log 2>&1
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
## OpenEnv CLI
|
| 93 |
+
|
| 94 |
+
```bash
|
| 95 |
+
openenv/bin/openenv validate --json
|
| 96 |
+
openenv/bin/openenv build
|
| 97 |
+
openenv/bin/openenv push
|
| 98 |
+
```
|
__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from client import FlakySleuthClient
|
| 2 |
+
from env.environment import FlakySleuthEnv
|
| 3 |
+
from models import FlakySleuthAction, FlakySleuthObservation, FlakySleuthReward
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
"FlakySleuthClient",
|
| 7 |
+
"FlakySleuthEnv",
|
| 8 |
+
"FlakySleuthAction",
|
| 9 |
+
"FlakySleuthObservation",
|
| 10 |
+
"FlakySleuthReward",
|
| 11 |
+
]
|
client.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
|
| 8 |
+
from env.models import FlakySleuthAction
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class FlakySleuthClient:
    """Thin HTTP client for a FlakySleuth environment server.

    Wraps the server's ``/reset``, ``/step``, and ``/state`` endpoints and
    returns the decoded JSON payloads. Non-2xx responses raise
    ``requests.HTTPError`` via ``raise_for_status``.
    """

    base_url: str
    timeout_s: float = 30.0

    def _endpoint(self, name: str) -> str:
        # Normalize trailing slashes so callers may pass either URL form.
        return f"{self.base_url.rstrip('/')}/{name}"

    def reset(self) -> dict[str, Any]:
        """Start a new episode and return the initial observation payload."""
        resp = requests.post(self._endpoint("reset"), timeout=self.timeout_s)
        resp.raise_for_status()
        return resp.json()

    def step(self, action: FlakySleuthAction) -> dict[str, Any]:
        """Send one action to the environment and return the step result."""
        resp = requests.post(
            self._endpoint("step"),
            json={"action": action.model_dump()},
            timeout=self.timeout_s,
        )
        resp.raise_for_status()
        return resp.json()

    def state(self) -> dict[str, Any]:
        """Fetch the server's current episode state."""
        resp = requests.get(self._endpoint("state"), timeout=self.timeout_s)
        resp.raise_for_status()
        return resp.json()
|
dataset/build_dataset.py
ADDED
|
@@ -0,0 +1,461 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Offline dataset builder for FlakySleuth.
|
| 2 |
+
|
| 3 |
+
Examples:
|
| 4 |
+
# Validate schema and show category/status summary only
|
| 5 |
+
python dataset/build_dataset.py --input py-data.csv --validate-only
|
| 6 |
+
|
| 7 |
+
# Build full task CSV (requires network access for repo cloning)
|
| 8 |
+
export GITHUB_TOKEN=...
|
| 9 |
+
python dataset/build_dataset.py --input py-data.csv --output dataset/py_tasks.csv
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import argparse
|
| 15 |
+
import csv
|
| 16 |
+
import os
|
| 17 |
+
import subprocess
|
| 18 |
+
import tempfile
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from urllib.parse import urlparse
|
| 21 |
+
|
| 22 |
+
import pandas as pd
|
| 23 |
+
import requests
|
| 24 |
+
|
| 25 |
+
try:
|
| 26 |
+
from tqdm import tqdm
|
| 27 |
+
except Exception: # pragma: no cover
|
| 28 |
+
tqdm = None
|
| 29 |
+
|
| 30 |
+
TASK12_CATEGORIES = ["NOD", "TD", "TZD", "NIO", "ID", "OD", "OD-Brit", "OD-Vic"]
|
| 31 |
+
TASK3_CATEGORIES = ["TD", "TZD", "NOD", "NIO", "ID"]
|
| 32 |
+
|
| 33 |
+
PROJECT_URL_COL = "Project URL"
|
| 34 |
+
SHA_COL = "SHA Detected"
|
| 35 |
+
CATEGORY_COL = "Category"
|
| 36 |
+
STATUS_COL = "Status"
|
| 37 |
+
PR_LINK_COL = "PR Link"
|
| 38 |
+
NOTES_COL = "Notes"
|
| 39 |
+
TEST_NAME_ALIASES = [
|
| 40 |
+
"Pytest Test Name",
|
| 41 |
+
"Pytest Test Name (PathToFile::TestClass::TestMethod or PathToFile::TestMethod)",
|
| 42 |
+
]
|
| 43 |
+
OUTPUT_COLUMNS = [
|
| 44 |
+
"repo_url",
|
| 45 |
+
"sha",
|
| 46 |
+
"test_name",
|
| 47 |
+
"test_file",
|
| 48 |
+
"category",
|
| 49 |
+
"label",
|
| 50 |
+
"status",
|
| 51 |
+
"pr_link",
|
| 52 |
+
"task_types",
|
| 53 |
+
"test_code",
|
| 54 |
+
"known_fix_diff",
|
| 55 |
+
]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def _normalize_header(text: str) -> str:
|
| 59 |
+
return " ".join(str(text).strip().split())
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _resolve_test_name_column(columns: list[str]) -> str:
    """Return the original column name matching a known test-name alias.

    Matching is done on whitespace-normalized headers so cosmetic spacing
    differences in the CSV do not matter. Raises KeyError when no alias
    is present.
    """
    by_normalized = {_normalize_header(name): name for name in columns}
    for alias in TEST_NAME_ALIASES:
        match = by_normalized.get(_normalize_header(alias))
        if match is not None:
            return match
    raise KeyError(
        "Could not find pytest test-name column. Expected one of: "
        + ", ".join(TEST_NAME_ALIASES)
    )
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _parse_pr_link(pr_link: str) -> tuple[str, str] | None:
|
| 75 |
+
"""Return (owner/repo, number) from URL or owner/repo#number."""
|
| 76 |
+
value = (pr_link or "").strip()
|
| 77 |
+
if not value or value.lower() == "nan":
|
| 78 |
+
return None
|
| 79 |
+
|
| 80 |
+
if value.startswith("http://") or value.startswith("https://"):
|
| 81 |
+
parsed = urlparse(value)
|
| 82 |
+
parts = [p for p in parsed.path.split("/") if p]
|
| 83 |
+
# Expected: /owner/repo/pull/number
|
| 84 |
+
if len(parts) >= 4 and parts[2] == "pull" and parts[3].isdigit():
|
| 85 |
+
return f"{parts[0]}/{parts[1]}", parts[3]
|
| 86 |
+
return None
|
| 87 |
+
|
| 88 |
+
if "#" in value:
|
| 89 |
+
repo, number = value.split("#", 1)
|
| 90 |
+
if repo.strip() and number.strip().isdigit():
|
| 91 |
+
return repo.strip(), number.strip()
|
| 92 |
+
return None
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _is_accepted_status(status: str) -> bool:
|
| 96 |
+
value = (status or "").strip().lower()
|
| 97 |
+
return value in {"accepted", "merged", "fixed"}
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def _non_interactive_git_env() -> dict[str, str]:
|
| 101 |
+
env = os.environ.copy()
|
| 102 |
+
# Never block on credential prompts while iterating large public datasets.
|
| 103 |
+
env["GIT_TERMINAL_PROMPT"] = "0"
|
| 104 |
+
env["GCM_INTERACTIVE"] = "Never"
|
| 105 |
+
return env
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _has_value(value: str) -> bool:
|
| 109 |
+
text = str(value or "").strip().lower()
|
| 110 |
+
return text not in {"", "nan", "none"}
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def _is_non_unmaintained_status(status: str) -> bool:
|
| 114 |
+
value = str(status or "").strip().lower()
|
| 115 |
+
return value not in {"", "nan", "none", "unmaintained"}
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def _row_preference_rank(row_out: dict[str, str]) -> tuple[int, int, int]:
    """Rank a candidate output row for duplicate resolution (higher wins).

    Preference order: supports fix_proposal, then has a PR link, then has a
    maintained (non-empty, non-'Unmaintained') status.
    """
    raw_types = str(row_out.get("task_types", ""))
    tokens = {token.strip() for token in raw_types.split(";") if token.strip()}
    has_fix = "fix_proposal" in tokens
    has_pr = _has_value(str(row_out.get("pr_link", "")))
    maintained = _is_non_unmaintained_status(str(row_out.get("status", "")))
    return (int(has_fix), int(has_pr), int(maintained))
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def fetch_test_code(repo_url: str, sha: str, pytest_test_name: str) -> tuple[str, str, str]:
    """Fetch the source of a test file at an exact commit SHA.

    Clones nothing permanent: a throwaway repo is initialized in a temp dir,
    only the requested commit is fetched (``--depth=1``), and the file is
    read from the detached checkout.

    Returns ``(code, failure_reason, failure_detail)``. On success the reason
    and detail are empty and the code is truncated to 10000 characters.
    """
    test_file = pytest_test_name.split("::")[0]
    git_env = _non_interactive_git_env()

    def run_git(args: list[str], timeout: int):
        # All git calls are non-interactive and never raise on nonzero exit;
        # subprocess.TimeoutExpired is handled by the caller.
        return subprocess.run(
            args,
            capture_output=True,
            text=True,
            check=False,
            timeout=timeout,
            env=git_env,
            stdin=subprocess.DEVNULL,
        )

    def fail_detail(result) -> str:
        return (result.stderr or result.stdout or "").strip()[:200]

    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            result = run_git(["git", "init", tmpdir], timeout=20)
            if result.returncode != 0:
                return "", "git_init_failed", fail_detail(result)

            result = run_git(
                ["git", "-C", tmpdir, "remote", "add", "origin", repo_url],
                timeout=10,
            )
            if result.returncode != 0:
                return "", "git_remote_add_failed", fail_detail(result)

            # Fetch only the requested commit for speed and correctness.
            result = run_git(
                ["git", "-C", tmpdir, "fetch", "--depth=1", "origin", sha],
                timeout=90,
            )
            if result.returncode != 0:
                return "", "git_fetch_sha_failed", fail_detail(result)

            result = run_git(
                ["git", "-C", tmpdir, "checkout", "--detach", "FETCH_HEAD"],
                timeout=30,
            )
            if result.returncode != 0:
                return "", "git_checkout_failed", fail_detail(result)
        except subprocess.TimeoutExpired:
            return "", "git_timeout", "timeout"

        file_path = Path(tmpdir) / test_file
        if not file_path.exists():
            return "", "test_file_missing_at_sha", test_file
        return file_path.read_text(encoding="utf-8", errors="replace")[:10000], "", ""
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def fetch_pr_diff(pr_link: str, github_token: str) -> str:
    """Best-effort fetch of a PR's unified diff via the GitHub API.

    Returns the first 3000 characters of the diff, or "" when the link cannot
    be parsed, GitHub returns a non-200 status, or the request itself fails.
    Network errors are swallowed so a single flaky request cannot abort a
    long-running dataset build.
    """
    parsed = _parse_pr_link(pr_link)
    if not parsed:
        return ""

    repo, number = parsed
    url = f"https://api.github.com/repos/{repo}/pulls/{number}"
    # The diff media type makes GitHub return the raw unified diff body.
    headers = {"Accept": "application/vnd.github.diff"}
    # Only attach credentials when a token is actually available: sending
    # "Authorization: token " with an empty value makes GitHub reject
    # otherwise-valid unauthenticated requests.
    if github_token:
        headers["Authorization"] = f"token {github_token}"
    try:
        response = requests.get(url, headers=headers, timeout=15)
    except requests.RequestException:
        return ""
    if response.status_code == 200:
        return response.text[:3000]
    return ""
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def _validate_schema(input_csv: str) -> tuple[pd.DataFrame, str]:
    """Load the raw IDoFT CSV and verify the required columns exist.

    Headers are whitespace-normalized before the check. Returns the loaded
    dataframe plus the resolved test-name column; raises KeyError when any
    required column is absent.
    """
    df = pd.read_csv(input_csv)
    df.columns = [_normalize_header(col) for col in df.columns]

    required_columns = (PROJECT_URL_COL, SHA_COL, CATEGORY_COL, STATUS_COL, PR_LINK_COL)
    missing = [name for name in required_columns if name not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    return df, _resolve_test_name_column(list(df.columns))
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def _print_input_summary(df: pd.DataFrame, test_name_col: str) -> None:
    """Print a short schema/summary report for the raw input dataframe."""
    print("Input schema check: OK")
    print(f"Rows: {len(df)}")
    print(f"Using test-name column: {test_name_col}")
    print("Columns:", list(df.columns))
    # Top-20 value counts for the two categorical columns of interest.
    for heading, column in (
        ("\nCategory distribution (top 20):", CATEGORY_COL),
        ("\nStatus distribution:", STATUS_COL),
    ):
        print(heading)
        print(df[column].fillna("").astype(str).value_counts().head(20))
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def build(
    input_csv: str,
    output_csv: str,
    github_token: str,
    *,
    validate_only: bool = False,
    limit: int | None = None,
) -> None:
    """Build the FlakySleuth task CSV from a raw IDoFT CSV.

    Pipeline per input row: validate core fields, derive task types from the
    flakiness category, fetch the test file's source at the exact SHA, and
    (optionally, token permitting) fetch the known-fix PR diff. Rows are
    streamed to ``output_csv`` as they are kept (so a crash mid-run still
    leaves partial output), then the file is rewritten at the end with one
    canonical row per (repo_url, sha, test_name) key.

    Args:
        input_csv: Path to the raw IDoFT CSV (see module constants for the
            required columns).
        output_csv: Destination CSV; parent directories are created.
        github_token: Token for the GitHub API PR-diff fetch; when empty,
            ``known_fix_diff`` is left blank.
        validate_only: Only validate the schema and print a summary; no
            cloning/fetching is performed.
        limit: Optional cap on the number of input rows processed.
    """
    df, test_name_col = _validate_schema(input_csv)
    _print_input_summary(df, test_name_col)
    if validate_only:
        return

    total_rows = min(len(df), limit) if limit is not None else len(df)
    print(
        f"\nStarting build over {total_rows} rows "
        f"(this can take a while: cloning repos + reading files + optional PR diff fetch)"
    )

    # Per-row outcome counters, reported in the final build summary.
    stats: dict[str, int] = {
        "kept": 0,
        "kept_unique": 0,
        "skipped_missing_core_fields": 0,
        "skipped_ud": 0,
        "skipped_no_task_types": 0,
        "skipped_test_code_fetch_failed": 0,
        "skipped_test_code_fetch_git_fail": 0,
        "skipped_test_code_fetch_file_missing": 0,
        "fix_diff_fetched": 0,
        "duplicate_key_rows_seen": 0,
        "duplicate_key_replaced": 0,
        "duplicate_key_kept_existing": 0,
    }
    fetch_fail_examples: list[dict[str, str]] = []
    # Canonical row per (repo_url, sha, test_name); duplicates are resolved
    # by _row_preference_rank and the final CSV is written from this map.
    canonical_rows: dict[tuple[str, str, str], dict[str, str]] = {}
    output_path = Path(output_csv)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    iterator = df.iterrows()
    if tqdm is not None:
        iterator = tqdm(iterator, total=total_rows, desc="Building tasks", unit="row")

    with output_path.open("w", encoding="utf-8", newline="") as out_fp:
        writer = csv.DictWriter(out_fp, fieldnames=OUTPUT_COLUMNS, extrasaction="ignore")
        writer.writeheader()
        out_fp.flush()

        processed = 0
        for idx, (_, row) in enumerate(iterator, start=1):
            if idx > total_rows:
                break
            processed = idx

            repo_url = str(row.get(PROJECT_URL_COL, "")).strip()
            sha = str(row.get(SHA_COL, "")).strip()
            test_name = str(row.get(test_name_col, "")).strip()
            category_raw = str(row.get(CATEGORY_COL, "")).strip()
            status = str(row.get(STATUS_COL, "")).strip()
            pr_link = str(row.get(PR_LINK_COL, "")).strip()

            # All of repo/sha/test/category are required to build any task.
            if not repo_url or not sha or not test_name or not category_raw:
                stats["skipped_missing_core_fields"] += 1
                _update_progress(iterator, tqdm, stats)
                continue

            # Multi-category rows keep only the first category token.
            category = category_raw.split(";")[0].strip()
            if category == "UD":
                stats["skipped_ud"] += 1
                _update_progress(iterator, tqdm, stats)
                continue

            # Derive which tasks this row supports: classify/root_cause for
            # known categories; fix_proposal additionally requires an accepted
            # status and a parseable PR link.
            task_types: list[str] = []
            if category in TASK12_CATEGORIES:
                task_types.extend(["classify", "root_cause"])
            if category in TASK3_CATEGORIES and _is_accepted_status(status) and _parse_pr_link(pr_link):
                task_types.append("fix_proposal")

            if not task_types:
                stats["skipped_no_task_types"] += 1
                _update_progress(iterator, tqdm, stats)
                continue

            test_code, fetch_reason, fetch_detail = fetch_test_code(repo_url, sha, test_name)
            if not test_code:
                stats["skipped_test_code_fetch_failed"] += 1
                if fetch_reason in {
                    "git_init_failed",
                    "git_remote_add_failed",
                    "git_fetch_sha_failed",
                    "git_checkout_failed",
                    "git_timeout",
                }:
                    stats["skipped_test_code_fetch_git_fail"] += 1
                if fetch_reason == "test_file_missing_at_sha":
                    stats["skipped_test_code_fetch_file_missing"] += 1
                # Keep a small sample of failures for the final report.
                if len(fetch_fail_examples) < 10:
                    fetch_fail_examples.append(
                        {
                            "repo_url": repo_url,
                            "sha": sha,
                            "test_name": test_name,
                            "reason": fetch_reason,
                            "detail": fetch_detail,
                        }
                    )
                _update_progress(iterator, tqdm, stats)
                continue

            # PR diff fetch is best-effort and only attempted with a token.
            known_fix_diff = ""
            if "fix_proposal" in task_types and github_token:
                known_fix_diff = fetch_pr_diff(pr_link, github_token)
                if known_fix_diff:
                    stats["fix_diff_fetched"] += 1

            row_out = {
                "repo_url": repo_url,
                "sha": sha,
                "test_name": test_name,
                "test_file": test_name.split("::")[0],
                "category": category,
                # All IDoFT rows describe flaky tests; label is constant.
                "label": "flaky",
                "status": status,
                "pr_link": pr_link,
                "task_types": ";".join(task_types),
                "test_code": test_code,
                "known_fix_diff": known_fix_diff,
            }
            # Stream + flush so a crash mid-run still leaves partial output.
            writer.writerow(row_out)
            out_fp.flush()
            stats["kept"] += 1

            row_key = (
                row_out["repo_url"],
                row_out["sha"],
                row_out["test_name"],
            )
            if row_key not in canonical_rows:
                canonical_rows[row_key] = row_out
            else:
                stats["duplicate_key_rows_seen"] += 1
                current = canonical_rows[row_key]
                # Keep the more informative of the two duplicate rows.
                if _row_preference_rank(row_out) > _row_preference_rank(current):
                    canonical_rows[row_key] = row_out
                    stats["duplicate_key_replaced"] += 1
                else:
                    stats["duplicate_key_kept_existing"] += 1
            _update_progress(iterator, tqdm, stats, processed, total_rows)

    # Rewrite the output with the deduplicated canonical rows only.
    out = pd.DataFrame(list(canonical_rows.values()), columns=OUTPUT_COLUMNS)
    stats["kept_unique"] = len(out)
    out.to_csv(output_csv, index=False)

    if tqdm is None:
        print()

    print("\nBuild summary:")
    for key, value in stats.items():
        print(f"  {key}: {value}")
    print(f"Built {len(out)} task rows -> {output_csv}")
    if fetch_fail_examples:
        print("\nSample fetch failures (first 10):")
        for i, sample in enumerate(fetch_fail_examples, start=1):
            print(
                f"  {i}. reason={sample['reason']} "
                f"repo={sample['repo_url']} sha={sample['sha']} "
                f"test={sample['test_name']} detail={sample['detail']}"
            )
    if len(out):
        print(out["category"].value_counts())
        print(out["task_types"].value_counts())
|
| 401 |
+
|
| 402 |
+
|
| 403 |
+
def _update_progress(
|
| 404 |
+
iterator,
|
| 405 |
+
tqdm_mod,
|
| 406 |
+
stats: dict[str, int],
|
| 407 |
+
processed: int | None = None,
|
| 408 |
+
total_rows: int | None = None,
|
| 409 |
+
) -> None:
|
| 410 |
+
if tqdm_mod is not None and hasattr(iterator, "set_postfix"):
|
| 411 |
+
iterator.set_postfix(
|
| 412 |
+
kept=stats["kept"],
|
| 413 |
+
miss=stats["skipped_missing_core_fields"],
|
| 414 |
+
ud=stats["skipped_ud"],
|
| 415 |
+
no_task=stats["skipped_no_task_types"],
|
| 416 |
+
fetch_fail=stats["skipped_test_code_fetch_failed"],
|
| 417 |
+
)
|
| 418 |
+
return
|
| 419 |
+
|
| 420 |
+
if processed is None or total_rows is None:
|
| 421 |
+
return
|
| 422 |
+
if processed == 1 or processed % 20 == 0 or processed == total_rows:
|
| 423 |
+
print(
|
| 424 |
+
f"\r[{processed}/{total_rows}] "
|
| 425 |
+
f"kept={stats['kept']} "
|
| 426 |
+
f"fetch_fail={stats['skipped_test_code_fetch_failed']} "
|
| 427 |
+
f"no_task={stats['skipped_no_task_types']}",
|
| 428 |
+
end="",
|
| 429 |
+
flush=True,
|
| 430 |
+
)
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
def main() -> None:
    """CLI entry point: parse flags and run the dataset build."""
    parser = argparse.ArgumentParser(description="Build FlakySleuth task dataset")
    parser.add_argument("--input", default="idoft/py-data.csv", help="Path to IDoFT py-data.csv")
    parser.add_argument("--output", default="dataset/py_tasks.csv", help="Output CSV path")
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help="Validate input schema and print summary, without cloning/fetching.",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Optional max input rows to process (useful for quick sanity checks).",
    )
    args = parser.parse_args()

    # An empty token disables PR-diff fetching inside build().
    build(
        args.input,
        args.output,
        os.environ.get("GITHUB_TOKEN", ""),
        validate_only=args.validate_only,
        limit=args.limit,
    )


if __name__ == "__main__":
    main()
|
dataset/category_similarity.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"OD,OD-Brit": 0.7,
|
| 3 |
+
"OD,OD-Vic": 0.7,
|
| 4 |
+
"OD-Brit,OD-Vic": 0.8,
|
| 5 |
+
"OD,NIO": 0.4,
|
| 6 |
+
"OD,NDOI": 0.3,
|
| 7 |
+
"NOD,TD": 0.6,
|
| 8 |
+
"NOD,TZD": 0.5,
|
| 9 |
+
"NOD,NDOI": 0.5,
|
| 10 |
+
"TD,TZD": 0.7,
|
| 11 |
+
"NOD,ID": 0.3,
|
| 12 |
+
"UD,OD": 0.2,
|
| 13 |
+
"UD,NOD": 0.2,
|
| 14 |
+
"UD,NIO": 0.2,
|
| 15 |
+
"UD,TD": 0.2,
|
| 16 |
+
"UD,ID": 0.2
|
| 17 |
+
}
|
dataset/fixtures/toy_project/src/math_utils.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def unstable_sum(values):
    # Deliberately flaky fixture: shuffles the list in place, then sums the
    # first two elements, so the result depends on the random ordering.
    random.shuffle(values)
    first, second = values[0], values[1]
    return first + second
|
dataset/fixtures/toy_project/tests/test_flaky.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.math_utils import unstable_sum
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def test_randomized_total():
    # Intentionally flaky fixture test: it passes only when the in-place
    # shuffle happens to leave 1 and 2 in the first two positions.
    data = [1, 2, 3]
    assert unstable_sum(data) == 3
|
env/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from env.environment import FlakySleuthEnv
|
| 2 |
+
from env.models import FlakySleuthAction, FlakySleuthObservation, FlakySleuthReward
|
| 3 |
+
|
| 4 |
+
__all__ = ["FlakySleuthEnv", "FlakySleuthAction", "FlakySleuthObservation", "FlakySleuthReward"]
|
env/environment.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
from env.models import FlakySleuthAction, FlakySleuthObservation
|
| 6 |
+
from env.sandbox import Sandbox
|
| 7 |
+
from env.task_loader import TaskLoader
|
| 8 |
+
from graders import grade_action
|
| 9 |
+
|
| 10 |
+
FLAKY_SIGNAL_PATTERNS = [
|
| 11 |
+
"sleep",
|
| 12 |
+
"random",
|
| 13 |
+
"time",
|
| 14 |
+
"datetime",
|
| 15 |
+
"thread",
|
| 16 |
+
"asyncio",
|
| 17 |
+
"fixture",
|
| 18 |
+
"setup",
|
| 19 |
+
"teardown",
|
| 20 |
+
"global",
|
| 21 |
+
"shared",
|
| 22 |
+
"singleton",
|
| 23 |
+
"os.environ",
|
| 24 |
+
"socket",
|
| 25 |
+
"timeout",
|
| 26 |
+
"retry",
|
| 27 |
+
"mock",
|
| 28 |
+
"patch",
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
TERMINAL_ACTIONS = ("classify_flakiness", "classify_root_cause", "propose_fix")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class FlakySleuthEnv:
    """RL environment for investigating flaky pytest tests.

    Each episode samples a task row from the CSV dataset, materialises the
    repository in a temporary :class:`Sandbox`, and lets the agent explore
    (``read_file`` / ``search_code`` / ``run_test``) before committing to a
    terminal verdict that is scored by the graders.

    Reward model:
      * exploratory actions yield small shaping rewards, accumulated in
        ``cumulative_progress`` (clamped to [0, 0.30]);
      * a terminal action yields ``progress + grader score`` minus penalties,
        clamped to [0, 1].
    """

    def __init__(self, dataset_path: str = "dataset/py_tasks.csv", max_steps: int = 20):
        self.loader = TaskLoader(dataset_path)
        # Created lazily in reset(); None means "no active episode".
        self.sandbox: Sandbox | None = None
        self.current_task: dict[str, Any] | None = None
        self.step_count = 0
        self.max_steps = max_steps
        # Shaping reward accumulated from exploratory actions (capped at 0.30).
        self.cumulative_progress = 0.0
        # Files already read this episode; re-reads earn no progress.
        self.files_read: set[str] = set()
        self.episode_actions: list[FlakySleuthAction] = []

    def reset(self) -> FlakySleuthObservation:
        """Start a fresh episode: sample a task, rebuild the sandbox, zero counters."""
        if self.sandbox:
            self.sandbox.cleanup()

        self.current_task = self.loader.sample()
        # Dataset rows are flaky by construction; default the label accordingly.
        self.current_task.setdefault("label", "flaky")

        self.sandbox = Sandbox(self.current_task)
        self.sandbox.setup()

        # Record sandbox paths on the task so graders/tools can resolve them.
        self.current_task["sandbox_root"] = self.sandbox.tmpdir or ""
        test_file = self.current_task.get("test_file", "")
        if test_file and self.sandbox.tmpdir:
            self.current_task["sandbox_test_path"] = f"{self.sandbox.tmpdir}/{test_file}"

        self.step_count = 0
        self.cumulative_progress = 0.0
        self.files_read = set()
        self.episode_actions = []

        return self._make_obs()

    def step(self, action: FlakySleuthAction):
        """Execute one action; returns ``(obs, reward, done, info)``.

        Raises:
            RuntimeError: if called before :meth:`reset`.
        """
        if not self.current_task or not self.sandbox:
            raise RuntimeError("Environment is not initialized. Call reset() first.")

        self.step_count += 1
        self.episode_actions.append(action)

        tool_output: str | None = None
        reward = 0.0
        done = False
        info: dict[str, Any] = {}

        if action.action_type in TERMINAL_ACTIONS:
            terminal_score = grade_action(action, self.current_task)
            # 0.05 per step beyond step 15 — discourages dawdling before a verdict.
            late_penalty = max(0, self.step_count - 15) * 0.05

            # Extra penalty for calling a genuinely flaky test "stable".
            wrong_dir_penalty = 0.0
            if (
                action.action_type == "classify_flakiness"
                and action.argument.strip().lower() == "stable"
                and str(self.current_task.get("label", "flaky")).lower() == "flaky"
            ):
                wrong_dir_penalty = 0.2

            reward = min(
                1.0,
                max(
                    0.0,
                    self.cumulative_progress + terminal_score - late_penalty - wrong_dir_penalty,
                ),
            )
            done = True
            info = {
                "terminal_score": terminal_score,
                "progress_score": self.cumulative_progress,
                "late_penalty": late_penalty,
                "task_type": self.current_task.get("task_type"),
                "category": self.current_task.get("category"),
            }
        else:
            # Exploration path: per-step reward may be negative (bad action),
            # but the accumulated total is clamped to [0, 0.30].
            tool_output, progress = self._execute_exploration(action)
            self.cumulative_progress = min(0.30, max(0.0, self.cumulative_progress + progress))
            reward = progress

        # Episode times out if the budget is exhausted without a verdict.
        if not done and self.step_count >= self.max_steps:
            done = True
            info = {
                "terminal_score": 0.0,
                "progress_score": self.cumulative_progress,
                "late_penalty": max(0, self.step_count - 15) * 0.05,
                "timeout": True,
                "task_type": self.current_task.get("task_type"),
                "category": self.current_task.get("category"),
            }

        obs = self._make_obs(tool_output)
        return obs, reward, done, info

    def state(self) -> dict[str, Any]:
        """Snapshot of episode state for the /state endpoint (no task sampled → mostly None)."""
        if not self.current_task:
            return {
                "repo_url": None,
                "test_name": None,
                "task_type": None,
                "step_count": self.step_count,
                "files_read": [],
                "cumulative_progress": self.cumulative_progress,
            }

        return {
            "repo_url": self.current_task.get("repo_url"),
            "test_name": self.current_task.get("test_name"),
            "task_type": self.current_task.get("task_type"),
            "step_count": self.step_count,
            "files_read": sorted(self.files_read),
            "cumulative_progress": self.cumulative_progress,
        }

    def close(self) -> None:
        """Tear down the sandbox (idempotent)."""
        if self.sandbox:
            self.sandbox.cleanup()
            self.sandbox = None

    def _execute_exploration(self, action: FlakySleuthAction) -> tuple[str, float]:
        """Run a non-terminal action; returns ``(tool_output, progress_delta)``."""
        assert self.current_task is not None
        assert self.sandbox is not None

        progress = 0.0
        output = ""

        if action.action_type == "read_file":
            content = self.sandbox.read_file(action.argument)
            if content is None:
                output = f"ERROR: File not found: {action.argument}"
                progress = -0.05
            elif action.argument in self.files_read:
                # Re-reading a file is allowed but earns no new progress.
                output = content
                progress = 0.0
            else:
                self.files_read.add(action.argument)
                output = content
                progress = self._file_relevance_reward(action.argument)

        elif action.action_type == "search_code":
            output = self.sandbox.grep(action.argument)
            progress = self._search_relevance_reward(action.argument)

        elif action.action_type == "run_test":
            output = self.sandbox.run_test(self.current_task.get("test_name", ""))
            # Order-dependent categories are not executed (see Sandbox.run_test),
            # so they earn no run_test progress.
            category = str(self.current_task.get("category", "")).strip()
            if category not in ("OD", "OD-Brit", "OD-Vic"):
                progress = 0.05
        else:
            output = f"ERROR: Unsupported action_type {action.action_type}"
            progress = -0.05

        return output, progress

    def _file_relevance_reward(self, filepath: str) -> float:
        """Progress for a first-time file read: test file > other .py > anything else."""
        assert self.current_task is not None

        test_file = str(self.current_task.get("test_file", ""))
        if test_file and test_file in filepath:
            return 0.07
        if filepath.endswith(".py"):
            return 0.03
        return 0.01

    def _search_relevance_reward(self, pattern: str) -> float:
        """Progress for a search: higher when the pattern hints at flakiness signals."""
        pattern_lower = pattern.lower()
        if any(signal in pattern_lower for signal in FLAKY_SIGNAL_PATTERNS):
            return 0.04
        return 0.01

    def _make_obs(self, tool_output: str | None = None) -> FlakySleuthObservation:
        """Build the observation for the current task.

        Raises:
            RuntimeError: if no episode is active.
        """
        if not self.current_task:
            raise RuntimeError("No current task available")

        # NOTE(review): done/reward are passed through to the openenv
        # Observation base class; confirm the pydantic fallback in
        # env/models.py tolerates these extra fields.
        return FlakySleuthObservation(
            repo_url=str(self.current_task.get("repo_url", "")),
            test_name=str(self.current_task.get("test_name", "")),
            test_code=str(self.current_task.get("test_code", ""))[:2000],
            file_tree=self.sandbox.file_tree if self.sandbox else [],
            tool_output=tool_output,
            task_type=str(self.current_task.get("task_type", "classify")),
            task_description=str(self.current_task.get("task_description", "Investigate the flaky test.")),
            step_count=self.step_count,
            done=False,
            reward=None,
        )
|
env/models.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations

from typing import Any, Literal

from pydantic import BaseModel, Field

# Prefer the real OpenEnv base types when the package is installed; fall back
# to plain pydantic models so this module stays importable in isolation
# (e.g. local tests without the openenv runtime).
try:
    from openenv.core.env_server.types import Action, Observation
except Exception:  # pragma: no cover
    Action = BaseModel  # type: ignore[misc,assignment]
    Observation = BaseModel  # type: ignore[misc,assignment]
| 12 |
+
|
| 13 |
+
# The three task flavours an episode can present.
TaskType = Literal["classify", "root_cause", "fix_proposal"]


class FlakySleuthObservation(Observation):
    """Observation returned to the agent after reset() and every step().

    NOTE(review): FlakySleuthEnv._make_obs also passes ``done`` and ``reward``
    when constructing this model; those fields are expected to come from the
    openenv ``Observation`` base class — confirm the pydantic fallback path
    tolerates them.
    """

    repo_url: str = Field(..., description="Repository URL or fixture reference")
    test_name: str = Field(..., description="Pytest test identifier")
    test_code: str = Field(..., description="Test source snippet")
    file_tree: list[str] = Field(default_factory=list, description="Top-level file tree")
    tool_output: str | None = Field(default=None, description="Result of the previous exploratory action")
    task_type: TaskType = Field(..., description="Current task type")
    task_description: str = Field(..., description="Instruction for the agent")
    step_count: int = Field(default=0, description="Current episode step count")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class FlakySleuthAction(Action):
    """Single agent action: three exploratory tools plus three terminal verdicts."""

    # read_file / search_code / run_test keep the episode going; the remaining
    # three end it and trigger grading (see TERMINAL_ACTIONS in environment.py).
    action_type: Literal[
        "read_file",
        "search_code",
        "run_test",
        "classify_flakiness",
        "classify_root_cause",
        "propose_fix",
    ] = Field(..., description="Action to execute")
    # Meaning depends on action_type: file path, search pattern, verdict label
    # ('flaky'/'stable'), category code, or unified diff.
    argument: str = Field(default="", description="Action argument")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class FlakySleuthReward(BaseModel):
    """Structured reward report: final score plus a per-component breakdown."""

    # Final scalar reward in [0, 1].
    score: float
    # Component values (e.g. terminal_score, progress, penalties).
    breakdown: dict[str, Any]
    # Human-readable explanation of how the score was derived.
    explanation: str
|
env/sandbox.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import shutil
|
| 5 |
+
import subprocess
|
| 6 |
+
import tempfile
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class Sandbox:
    """Ephemeral working copy of a task's repository.

    Provides the tool surface exposed to the agent: bounded file reads (with
    path-traversal protection), code search, and — for non-order-dependent
    categories — pytest execution.  ``setup()`` must be called before any
    other method; ``cleanup()`` removes the temp directory.
    """

    def __init__(self, task: dict):
        # Dataset row for the episode; read for repo_url / sha / category.
        self.task = task
        # Temp directory holding the checked-out repo; None until setup().
        self.tmpdir: str | None = None
        # Shallow listing of the repo (depth <= 2, at most 100 entries).
        self.file_tree: list[str] = []

    def setup(self) -> None:
        """Prepare a working copy of the repository for the episode."""
        self.tmpdir = tempfile.mkdtemp(prefix="flakysleuth_")
        repo_url = str(self.task.get("repo_url", "")).strip()
        sha = str(self.task.get("sha", "")).strip()

        try:
            # fixture:// URLs point at repos bundled with the dataset;
            # anything else is fetched via git.
            if repo_url.startswith("fixture://"):
                self._copy_fixture_repo(repo_url)
            else:
                self._clone_repo(repo_url, sha)

            self.file_tree = self._build_file_tree()
        except Exception as exc:
            # Never leave a half-built temp directory behind.
            self.cleanup()
            raise RuntimeError(f"Sandbox setup failed: {exc}") from exc

    def read_file(self, relative_path: str) -> str | None:
        """Read a file relative to sandbox root. Returns None when not found/unsafe."""
        if not self.tmpdir:
            return None

        root = os.path.abspath(self.tmpdir)
        full_path = os.path.abspath(os.path.join(root, relative_path))

        # Path traversal guard.
        if os.path.commonpath([root, full_path]) != root:
            return None
        if not os.path.isfile(full_path):
            return None

        try:
            # Cap output at 4000 chars to keep observations bounded.
            with open(full_path, "r", encoding="utf-8", errors="replace") as handle:
                return handle.read()[:4000]
        except Exception:
            return None

    def grep(self, pattern: str) -> str:
        """Search .py files in repo, preferring ripgrep and falling back to grep."""
        if not self.tmpdir:
            return "ERROR: Sandbox not initialized"

        rg_cmd = ["rg", "-n", "--glob", "*.py", pattern, "."]
        grep_cmd = ["grep", "-RIn", "--include=*.py", pattern, "."]

        try:
            result = subprocess.run(
                rg_cmd,
                cwd=self.tmpdir,
                capture_output=True,
                text=True,
                timeout=10,
            )
        except FileNotFoundError:
            # ripgrep not installed in runtime; fall back to POSIX grep.
            try:
                result = subprocess.run(
                    grep_cmd,
                    cwd=self.tmpdir,
                    capture_output=True,
                    text=True,
                    timeout=10,
                )
            except FileNotFoundError:
                return (
                    "Search error: neither 'rg' (ripgrep) nor 'grep' is installed in the "
                    "runtime."
                )
            # These two handlers cover the grep fallback invocation.
            except subprocess.TimeoutExpired:
                return "Search timed out"
            except Exception as exc:
                return f"Search error: {exc}"
        # These two handlers cover the initial ripgrep invocation.
        except subprocess.TimeoutExpired:
            return "Search timed out"
        except Exception as exc:
            return f"Search error: {exc}"

        try:
            # Combine stdout/stderr and cap the match listing at 2000 chars.
            output = (result.stdout + result.stderr).strip()[:2000]
            if output:
                return output
            return f"No matches found for: {pattern}"
        except Exception as exc:
            return f"Search error: {exc}"

    def run_test(self, pytest_test_name: str) -> str:
        """Run a test for non-order-dependent categories."""
        if not self.tmpdir:
            return "ERROR: Sandbox not initialized"

        # Order-dependent tests only misbehave under specific test orderings,
        # so a single run proves nothing — steer the agent to static analysis.
        category = str(self.task.get("category", "")).strip()
        if category in ("OD", "OD-Brit", "OD-Vic"):
            return (
                "Test execution skipped for order-dependent tests. "
                "Use read_file and search_code for static analysis. "
                "Look for shared state, missing cleanup, or global mutations."
            )

        try:
            # NOTE(review): --timeout relies on the pytest-timeout plugin being
            # installed in the runtime image — confirm.
            result = subprocess.run(
                [
                    "python",
                    "-m",
                    "pytest",
                    pytest_test_name,
                    "--tb=short",
                    "-x",
                    "--timeout=30",
                    "-q",
                ],
                cwd=self.tmpdir,
                capture_output=True,
                text=True,
                timeout=60,
            )
            output = (result.stdout + result.stderr).strip()[:2000]
            return output or "Test completed with no output"
        except subprocess.TimeoutExpired:
            return "Test execution timed out (>60s)"
        except Exception as exc:
            return f"Test execution error: {exc}"

    def cleanup(self) -> None:
        """Remove the temp checkout and reset state (safe to call repeatedly)."""
        if self.tmpdir and os.path.exists(self.tmpdir):
            shutil.rmtree(self.tmpdir, ignore_errors=True)
        self.tmpdir = None
        self.file_tree = []

    def _clone_repo(self, repo_url: str, sha: str) -> None:
        """Materialise the repo at an exact SHA (preferred) or a shallow clone.

        Raises:
            ValueError: when repo_url is empty.
            RuntimeError: when any git step fails.
        """
        if not repo_url:
            raise ValueError("Missing repo_url")
        assert self.tmpdir is not None

        sha = (sha or "").strip()
        # Robust path: fetch the exact commit directly (works even when not in shallow branch history).
        if sha and sha.lower() != "nan":
            init = subprocess.run(
                ["git", "init", self.tmpdir],
                capture_output=True,
                text=True,
                timeout=20,
            )
            if init.returncode != 0:
                raise RuntimeError(f"git init failed: {init.stderr.strip()}")

            remote = subprocess.run(
                ["git", "-C", self.tmpdir, "remote", "add", "origin", repo_url],
                capture_output=True,
                text=True,
                timeout=15,
            )
            if remote.returncode != 0:
                raise RuntimeError(f"git remote add failed: {remote.stderr.strip()}")

            fetch = subprocess.run(
                ["git", "-C", self.tmpdir, "fetch", "--depth=1", "origin", sha],
                capture_output=True,
                text=True,
                timeout=120,
            )
            if fetch.returncode != 0:
                raise RuntimeError(
                    "git fetch exact sha failed: "
                    + (fetch.stderr.strip() or fetch.stdout.strip())
                )

            checkout = subprocess.run(
                ["git", "-C", self.tmpdir, "checkout", "--detach", "FETCH_HEAD"],
                capture_output=True,
                text=True,
                timeout=30,
            )
            if checkout.returncode != 0:
                raise RuntimeError(
                    "git checkout fetched sha failed: "
                    + (checkout.stderr.strip() or checkout.stdout.strip())
                )
            return

        # Fallback for rows without a SHA.
        clone = subprocess.run(
            ["git", "clone", "--depth=50", repo_url, self.tmpdir],
            capture_output=True,
            text=True,
            timeout=120,
        )
        if clone.returncode != 0:
            raise RuntimeError(
                "git clone failed: " + (clone.stderr.strip() or clone.stdout.strip())
            )

    def _copy_fixture_repo(self, repo_url: str) -> None:
        """Copy a bundled fixture repo (repo_url form: ``fixture://<name>``) into the sandbox."""
        fixture_name = repo_url.replace("fixture://", "", 1).strip("/")
        if not fixture_name:
            raise ValueError("Fixture name missing in repo_url")

        # Fixtures live alongside the package: <repo-root>/dataset/fixtures/<name>.
        fixture_dir = (
            Path(__file__).resolve().parent.parent
            / "dataset"
            / "fixtures"
            / fixture_name
        )
        if not fixture_dir.exists():
            raise FileNotFoundError(f"Fixture repo not found: {fixture_dir}")

        assert self.tmpdir is not None
        shutil.copytree(fixture_dir, self.tmpdir, dirs_exist_ok=True)

    def _build_file_tree(self) -> list[str]:
        """List repo files up to depth 2, skipping VCS/venv noise, capped at 100 paths."""
        assert self.tmpdir is not None
        result: list[str] = []
        for root, dirs, files in os.walk(self.tmpdir):
            # Prune hidden and generated directories in place so os.walk skips them.
            dirs[:] = [
                d
                for d in dirs
                if not d.startswith(".")
                and d not in ("node_modules", "__pycache__", ".git", "venv", ".tox")
            ]
            depth = root.replace(self.tmpdir, "").count(os.sep)
            if depth <= 2:
                for file_name in files:
                    rel_path = os.path.relpath(os.path.join(root, file_name), self.tmpdir)
                    result.append(rel_path)
                    # NOTE(review): this break only exits the inner file loop —
                    # os.walk keeps going — but the final slice still caps output.
                    if len(result) > 100:
                        break
        return result[:100]
|
env/task_loader.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import csv
|
| 4 |
+
import random
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class TaskLoader:
    """Loads the task-bank CSV and samples episode tasks from it.

    Each CSV row may declare several semicolon-separated task types; the row
    is expanded into one task entry per declared type.
    """

    def __init__(self, csv_path: str):
        source = Path(csv_path)
        if not source.exists():
            raise FileNotFoundError(f"Task CSV not found: {csv_path}")

        self.tasks: list[dict[str, Any]] = []
        with source.open("r", encoding="utf-8", newline="") as fh:
            for record in csv.DictReader(fh):
                # Expand "classify;root_cause" style rows into one entry each.
                for candidate in str(record.get("task_types", "")).split(";"):
                    name = candidate.strip()
                    if not name:
                        continue
                    expanded = dict(record)
                    expanded["task_type"] = name
                    self.tasks.append(expanded)

        if not self.tasks:
            raise ValueError(f"No tasks loaded from {csv_path}")

        # Optional filter set via force_task_type(); None samples from all.
        self._forced_type: str | None = None

    def sample(self) -> dict[str, Any]:
        """Return a random task (copy) with its task_description filled in."""
        candidates = self.tasks
        if self._forced_type:
            candidates = [t for t in self.tasks if t["task_type"] == self._forced_type]
            if not candidates:
                raise ValueError(f"No tasks available for task type: {self._forced_type}")

        chosen = random.choice(candidates).copy()
        chosen["task_description"] = self._make_description(chosen)
        return chosen

    def force_task_type(self, task_type: str | None) -> None:
        """Restrict sampling to one task type; pass None to clear the filter."""
        self._forced_type = task_type

    def _make_description(self, task: dict[str, Any]) -> str:
        """Build the agent-facing instruction text for a task."""
        kind = task["task_type"]
        fixed = {
            "classify": (
                "Investigate the given test and determine whether it is FLAKY or STABLE. "
                "Use read_file and search_code to gather evidence. "
                "When confident, call classify_flakiness with argument 'flaky' or 'stable'."
            ),
            "root_cause": (
                "This test is confirmed flaky. Identify its root cause category. "
                "Valid categories: OD, OD-Brit, OD-Vic, NIO, NOD, TD, TZD, ID, NDOI. "
                "Use read_file and search_code to find evidence. "
                "Call classify_root_cause with the category code when confident."
            ),
        }
        if kind in fixed:
            return fixed[kind]
        if kind == "fix_proposal":
            # fix_proposal embeds the task's category, so it cannot be static.
            return (
                f"This test is confirmed flaky with root cause: {task.get('category', 'unknown')}. "
                "Propose a concrete fix as a unified diff. "
                "Use read_file and search_code to understand the code. "
                "Call propose_fix with a valid unified diff string."
            )
        return "Investigate the flaky test."
|
flakysleuth_build_plan.md
ADDED
|
@@ -0,0 +1,1236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FlakySleuth — Comprehensive Round 1 Build Plan
|
| 2 |
+
## Meta × PyTorch × Scaler OpenEnv Hackathon
|
| 3 |
+
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
## 0. What You Are Building (One Paragraph for Clarity)
|
| 7 |
+
|
| 8 |
+
You are building an **OpenEnv-compliant RL environment** called `FlakySleuthEnv`. It simulates a real software engineering task: investigating flaky tests in real Python GitHub repositories. An LLM agent is dropped into a sandboxed repo at a specific commit, given a test that is known to be flaky (sourced from the IDoFT dataset), and must use tool calls (read files, grep code, run tests) to investigate and produce a verdict. The environment scores the agent's verdict using deterministic graders (Tasks 1 and 2) and a hybrid programmatic + LLM judge grader (Task 3). You are NOT training any model. The submitted artifact is the environment itself — its graders, reward logic, OpenEnv spec compliance, Docker container, and a baseline `inference.py` script that proves it works.
|
| 9 |
+
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
## 1. Repository Structure
|
| 13 |
+
|
| 14 |
+
```
|
| 15 |
+
flaky-sleuth-env/
|
| 16 |
+
│
|
| 17 |
+
├── inference.py ← REQUIRED: must be named exactly this, in root
|
| 18 |
+
├── openenv.yaml ← REQUIRED: OpenEnv spec metadata
|
| 19 |
+
├── Dockerfile ← REQUIRED: must build and run
|
| 20 |
+
├── requirements.txt
|
| 21 |
+
├── README.md
|
| 22 |
+
│
|
| 23 |
+
├── server.py ← FastAPI HTTP server (OpenEnv endpoints)
|
| 24 |
+
│
|
| 25 |
+
├── env/
|
| 26 |
+
│ ├── __init__.py
|
| 27 |
+
│ ├── models.py ← All Pydantic models (Observation, Action, Reward)
|
| 28 |
+
│ ├── environment.py ← FlakySleuthEnv core class
|
| 29 |
+
│ ├── sandbox.py ← Git clone, file read, grep, run_test
|
| 30 |
+
│ └── task_loader.py ← Loads tasks from dataset CSV
|
| 31 |
+
│
|
| 32 |
+
├── graders/
|
| 33 |
+
│ ├── __init__.py ← grade_action() dispatcher
|
| 34 |
+
│ ├── task1_grader.py ← Binary flaky/stable
|
| 35 |
+
│ ├── task2_grader.py ← Root cause category + similarity matrix
|
| 36 |
+
│ └── task3_grader.py ← Fix proposal: pattern + diff + LLM judge
|
| 37 |
+
│
|
| 38 |
+
├── dataset/
|
| 39 |
+
│ ├── build_dataset.py ← OFFLINE SCRIPT: preprocess IDoFT → py_tasks.csv
|
| 40 |
+
│ ├── py_tasks.csv ← Final preprocessed task bank (committed to repo)
|
| 41 |
+
│ └── category_similarity.json ← Similarity matrix for Task 2 partial credit
|
| 42 |
+
│
|
| 43 |
+
└── tests/
|
| 44 |
+
└── test_compliance.py ← openenv validate compliance checks
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
---
|
| 48 |
+
|
| 49 |
+
## 2. Data Pipeline (Do This First, Offline)
|
| 50 |
+
|
| 51 |
+
### 2.1 Download the Raw Dataset
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
git clone https://github.com/TestingResearchIllinois/idoft
|
| 55 |
+
# The file you need:
|
| 56 |
+
# idoft/py-data.csv
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
### 2.2 Understand the CSV Columns
|
| 60 |
+
|
| 61 |
+
The `py-data.csv` has these columns:
|
| 62 |
+
```
|
| 63 |
+
Project URL | SHA Detected | Pytest Test Name | Category | Status | PR Link | Notes
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
- **Project URL**: GitHub repo to clone
|
| 67 |
+
- **SHA Detected**: Exact commit to clone at (this is where the test IS flaky)
|
| 68 |
+
- **Pytest Test Name**: Format is `path/to/test_file.py::TestClass::test_method` or `path/to/test_file.py::test_method`
|
| 69 |
+
- **Category**: One of OD, OD-Brit, OD-Vic, NIO, NOD, UD, TD, TZD, ID, NDOI, NDOD, OSD (may be semicolon-separated for multiple)
|
| 70 |
+
- **Status**: Blank, Opened, Accepted, Rejected, etc.
|
| 71 |
+
- **PR Link**: Format `owner/repo#number` — only present when Status is Opened/Accepted
|
| 72 |
+
|
| 73 |
+
### 2.3 Filter Rules Per Task
|
| 74 |
+
|
| 75 |
+
```python
|
| 76 |
+
# Task 1 (classify): Use these categories — they have clear static signals
|
| 77 |
+
TASK1_CATEGORIES = ["NOD", "TD", "TZD", "NIO", "ID", "OD", "OD-Brit", "OD-Vic"]
|
| 78 |
+
|
| 79 |
+
# Task 2 (root cause): Same categories — agent must identify which one
|
| 80 |
+
TASK2_CATEGORIES = ["NOD", "TD", "TZD", "NIO", "ID", "OD", "OD-Brit", "OD-Vic"]
|
| 81 |
+
# Exclude "UD" (unknown — no ground truth to grade against)
|
| 82 |
+
|
| 83 |
+
# Task 3 (fix proposal): ONLY rows where a fix was accepted AND category is gradeable
|
| 84 |
+
TASK3_CATEGORIES = ["TD", "TZD", "NOD", "NIO", "ID"]
|
| 85 |
+
# Exclude: OD, OD-Brit, OD-Vic (cannot verify fix without multi-order execution)
|
| 86 |
+
# Exclude: UD (unknown cause = cannot score fix)
|
| 87 |
+
# Require: Status == "Accepted" AND PR Link is not empty
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
### 2.4 Build `py_tasks.csv` (the `build_dataset.py` script)
|
| 91 |
+
|
| 92 |
+
This script runs ONCE offline. It:
|
| 93 |
+
1. Reads `idoft/py-data.csv`
|
| 94 |
+
2. For each row, fetches the test source code by cloning the repo at SHA (or using GitHub raw API)
|
| 95 |
+
3. For Task 3 rows (Status=Accepted), fetches the PR diff from GitHub API
|
| 96 |
+
4. Outputs `dataset/py_tasks.csv`
|
| 97 |
+
|
| 98 |
+
```python
|
| 99 |
+
# dataset/build_dataset.py
|
| 100 |
+
|
| 101 |
+
import pandas as pd
|
| 102 |
+
import requests
|
| 103 |
+
import subprocess
|
| 104 |
+
import tempfile
|
| 105 |
+
import os
|
| 106 |
+
|
| 107 |
+
GITHUB_TOKEN = os.environ["GITHUB_TOKEN"] # set this before running
|
| 108 |
+
|
| 109 |
+
def fetch_test_code(repo_url: str, sha: str, pytest_test_name: str) -> str:
    """Clone *repo_url* at *sha* and return the test file's source code.

    pytest_test_name format: path/to/test.py::TestClass::test_method
    Returns "" when the clone/checkout fails or the file does not exist,
    so callers can simply skip the row.
    """
    test_file = pytest_test_name.split("::")[0]
    with tempfile.TemporaryDirectory() as tmpdir:
        # BUG FIX: a --depth=1 clone contains only the branch tip, so
        # checking out an arbitrary historical SHA would fail and the file
        # would silently be read at HEAD. Clone the full history so the
        # detection SHA is guaranteed to be present, and check both
        # subprocess results instead of ignoring them.
        clone = subprocess.run(
            ["git", "clone", repo_url, tmpdir],
            capture_output=True,
        )
        if clone.returncode != 0:
            return ""
        checkout = subprocess.run(
            ["git", "checkout", sha],
            cwd=tmpdir,
            capture_output=True,
        )
        if checkout.returncode != 0:
            return ""
        filepath = os.path.join(tmpdir, test_file)
        if not os.path.isfile(filepath):
            return ""
        with open(filepath, errors="replace") as f:
            return f.read()[:5000]  # cap at 5000 chars to bound CSV size
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def fetch_pr_diff(pr_link: str) -> str:
    """Fetch the unified diff of an accepted pull request.

    pr_link format: "owner/repo#number".
    Returns the diff text capped at 3000 chars, or "" when the link is
    malformed or the GitHub API request does not succeed.
    """
    if not pr_link or "#" not in pr_link:
        return ""
    repo, number = pr_link.strip().split("#")
    api_url = f"https://api.github.com/repos/{repo}/pulls/{number}"
    request_headers = {
        "Authorization": f"token {GITHUB_TOKEN}",
        # This media type makes the API return the raw diff instead of JSON.
        "Accept": "application/vnd.github.diff",
    }
    response = requests.get(api_url, headers=request_headers, timeout=10)
    return response.text[:3000] if response.status_code == 200 else ""
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def build():
    """Build dataset/py_tasks.csv from the raw IDoFT py-data.csv.

    For each usable row: decide which task types it supports, fetch the
    test source at the detection SHA, fetch the accepted-fix diff where
    relevant, and write one output row. Prints summary counts at the end.
    """
    df = pd.read_csv("idoft/py-data.csv")

    # Normalize column names (the raw CSV has stray whitespace in headers)
    df.columns = [c.strip() for c in df.columns]

    def _clean(value) -> str:
        # str(NaN) == "nan"; treat it the same as an empty cell.
        # BUG FIX: the original only applied this to pr_link, so rows with
        # a NaN repo/sha/test-name passed the missing-essentials check as
        # the literal string "nan".
        text = str(value).strip()
        return "" if text.lower() == "nan" else text

    rows = []
    for _, row in df.iterrows():
        repo_url = _clean(row.get("Project URL", ""))
        sha = _clean(row.get("SHA Detected", ""))
        test_name = _clean(row.get("Pytest Test Name", ""))
        category_raw = _clean(row.get("Category", ""))
        status = _clean(row.get("Status", ""))
        pr_link = _clean(row.get("PR Link", ""))

        # Skip rows with missing essentials
        if not repo_url or not sha or not test_name or not category_raw:
            continue

        # Take primary category (first if semicolon-separated)
        category = category_raw.split(";")[0].strip()

        # Skip UD entirely (no ground truth to grade against)
        if category == "UD":
            continue

        # Determine task types this row is eligible for
        task_types = []
        if category in ["NOD", "TD", "TZD", "NIO", "ID", "OD", "OD-Brit", "OD-Vic"]:
            task_types.append("classify")
            task_types.append("root_cause")
        if (category in ["TD", "TZD", "NOD", "NIO", "ID"]
                and status == "Accepted"
                and pr_link):
            task_types.append("fix_proposal")

        if not task_types:
            continue

        # Fetch test source code (skip the row if repo/sha/file unavailable)
        test_code = fetch_test_code(repo_url, sha, test_name)
        if not test_code:
            continue

        # Fetch fix diff for Task 3 eligible rows
        known_fix_diff = ""
        if "fix_proposal" in task_types:
            known_fix_diff = fetch_pr_diff(pr_link)

        rows.append({
            "repo_url": repo_url,
            "sha": sha,
            "test_name": test_name,
            "test_file": test_name.split("::")[0],
            "category": category,
            "status": status,
            "pr_link": pr_link,
            "task_types": ";".join(task_types),
            "test_code": test_code,
            "known_fix_diff": known_fix_diff,
        })

    out = pd.DataFrame(rows)
    out.to_csv("dataset/py_tasks.csv", index=False)
    print(f"Built {len(out)} task rows")
    print(out["category"].value_counts())
    print(out["task_types"].value_counts())
|
| 215 |
+
|
| 216 |
+
if __name__ == "__main__":
|
| 217 |
+
build()
|
| 218 |
+
```
|
| 219 |
+
|
| 220 |
+
### 2.5 Build `category_similarity.json`
|
| 221 |
+
|
| 222 |
+
```json
|
| 223 |
+
{
|
| 224 |
+
"OD,OD-Brit": 0.7,
|
| 225 |
+
"OD,OD-Vic": 0.7,
|
| 226 |
+
"OD-Brit,OD-Vic": 0.8,
|
| 227 |
+
"OD,NIO": 0.4,
|
| 228 |
+
"OD,NDOI": 0.3,
|
| 229 |
+
"NOD,TD": 0.6,
|
| 230 |
+
"NOD,TZD": 0.5,
|
| 231 |
+
"NOD,NDOI": 0.5,
|
| 232 |
+
"TD,TZD": 0.7,
|
| 233 |
+
"NOD,ID": 0.3,
|
| 234 |
+
"UD,OD": 0.2,
|
| 235 |
+
"UD,NOD": 0.2,
|
| 236 |
+
"UD,NIO": 0.2,
|
| 237 |
+
"UD,TD": 0.2,
|
| 238 |
+
"UD,ID": 0.2
|
| 239 |
+
}
|
| 240 |
+
```
|
| 241 |
+
|
| 242 |
+
---
|
| 243 |
+
|
| 244 |
+
## 3. Pydantic Models (`env/models.py`)
|
| 245 |
+
|
| 246 |
+
```python
|
| 247 |
+
from pydantic import BaseModel
|
| 248 |
+
from typing import Literal, Optional, List
|
| 249 |
+
|
| 250 |
+
class FlakySleuthObservation(BaseModel):
    """What the agent sees each step: static task context plus the output
    of its most recent tool call."""

    repo_url: str      # GitHub URL of the project under investigation
    test_name: str     # pytest node id, e.g. tests/test_x.py::test_y
    test_code: str     # source of the flaky test file (truncated by the env)
    file_tree: List[str]               # repo file listing (top levels only)
    tool_output: Optional[str] = None  # result of the previous action, if any
    task_type: Literal["classify", "root_cause", "fix_proposal"]
    task_description: str  # human-readable instructions for this task
    step_count: int        # steps taken so far in the episode
|
| 259 |
+
|
| 260 |
+
class FlakySleuthAction(BaseModel):
    """A single agent action: an exploratory tool call or a terminal verdict."""

    # First three are exploratory tools; the last three end the episode
    # and are graded.
    action_type: Literal[
        "read_file",
        "search_code",
        "run_test",
        "classify_flakiness",
        "classify_root_cause",
        "propose_fix",
    ]
    # Meaning depends on action_type: file path, search pattern, verdict
    # ('flaky'/'stable'), category code, or unified diff text.
    argument: str
|
| 270 |
+
|
| 271 |
+
class FlakySleuthReward(BaseModel):
    """Final episode reward with a per-component breakdown."""

    score: float      # total reward, clamped to [0.0, 1.0] by the env
    breakdown: dict   # component scores (progress, terminal, penalties, ...)
    explanation: str  # human-readable grading summary
|
| 275 |
+
```
|
| 276 |
+
|
| 277 |
+
---
|
| 278 |
+
|
| 279 |
+
## 4. Sandbox (`env/sandbox.py`)
|
| 280 |
+
|
| 281 |
+
The sandbox wraps a cloned git repo. It handles all filesystem operations.
|
| 282 |
+
|
| 283 |
+
```python
|
| 284 |
+
import subprocess
|
| 285 |
+
import tempfile
|
| 286 |
+
import os
|
| 287 |
+
import shutil
|
| 288 |
+
from typing import Optional, List
|
| 289 |
+
|
| 290 |
+
class Sandbox:
    """Wraps a cloned git repo checked out at the flaky-detection SHA.

    All agent filesystem/tool operations (read, grep, run pytest) go
    through this class so they stay confined to the temporary clone.
    """

    def __init__(self, task: dict):
        self.task = task
        self.tmpdir: Optional[str] = None  # temp clone dir; None until setup()
        self.file_tree: List[str] = []     # cached top-level file listing

    def setup(self):
        """Clone repo at the specific SHA. Called by env.reset().

        Raises RuntimeError (after cleaning up the temp dir) on any
        clone/checkout failure.
        """
        self.tmpdir = tempfile.mkdtemp(prefix="flakysleuth_")
        try:
            # Shallow clone for speed; depth 50 keeps recent detection
            # SHAs reachable for checkout.
            subprocess.run([
                "git", "clone", "--depth=50",
                self.task["repo_url"],
                self.tmpdir
            ], capture_output=True, timeout=60, check=True)

            # Checkout exact SHA where flakiness was detected
            subprocess.run([
                "git", "checkout", self.task["sha"]
            ], cwd=self.tmpdir, capture_output=True, timeout=30, check=True)

            self.file_tree = self._build_file_tree()
        except Exception as e:
            self.cleanup()
            raise RuntimeError(f"Sandbox setup failed: {e}")

    def read_file(self, relative_path: str) -> Optional[str]:
        """Read a file relative to repo root. Returns None if not found,
        outside the sandbox, or if the sandbox is not initialized."""
        if not self.tmpdir:
            return None  # BUG FIX: previously crashed when called pre-setup
        full_path = os.path.normpath(os.path.join(self.tmpdir, relative_path))
        # Security: ensure the resolved path stays inside tmpdir. Compare
        # against the directory WITH a trailing separator — a bare
        # startswith(self.tmpdir) would also accept a sibling directory
        # such as "/tmp/flakysleuth_x_evil".
        if not full_path.startswith(os.path.join(self.tmpdir, "")):
            return None
        if not os.path.isfile(full_path):
            return None
        try:
            with open(full_path, "r", errors="replace") as f:
                return f.read()[:4000]  # cap to avoid huge files
        except Exception:
            return None

    def grep(self, pattern: str) -> str:
        """Grep for pattern across all .py files in the repo."""
        if not self.tmpdir:
            return "ERROR: Sandbox not initialized"
        try:
            result = subprocess.run(
                ["grep", "-rn", "--include=*.py", pattern, "."],
                cwd=self.tmpdir,
                capture_output=True,
                text=True,
                timeout=10
            )
            output = result.stdout[:2000]
            return output if output else f"No matches found for: {pattern}"
        except subprocess.TimeoutExpired:
            return "Search timed out"
        except Exception as e:
            return f"Search error: {e}"

    def run_test(self, pytest_test_name: str) -> str:
        """
        Run the specific test via pytest.
        ONLY called for non-OD tasks: order-dependent flakiness cannot be
        reproduced by a single in-order run, so those get a static hint.
        """
        if self.task["category"] in ("OD", "OD-Brit", "OD-Vic"):
            return (
                "Test execution skipped for order-dependent tests. "
                "Use read_file and search_code to analyze static code structure instead. "
                "Look for: shared state, missing setUp/tearDown, module-scoped fixtures, global mutations."
            )
        try:
            # NOTE(review): "--timeout=30" requires the pytest-timeout
            # plugin to be installed in the execution environment — confirm.
            result = subprocess.run(
                ["python", "-m", "pytest", pytest_test_name,
                 "--tb=short", "-x", "--timeout=30", "-q"],
                cwd=self.tmpdir,
                capture_output=True,
                text=True,
                timeout=60
            )
            output = (result.stdout + result.stderr)[:2000]
            return output if output else "Test completed with no output"
        except subprocess.TimeoutExpired:
            return "Test execution timed out (>60s)"
        except Exception as e:
            return f"Test execution error: {e}"

    def cleanup(self):
        """Remove temp directory. Called after episode ends. Idempotent."""
        if self.tmpdir and os.path.exists(self.tmpdir):
            shutil.rmtree(self.tmpdir, ignore_errors=True)
        self.tmpdir = None
        self.file_tree = []

    def _build_file_tree(self) -> List[str]:
        """Return top-2-level file paths relative to repo root (max 100)."""
        result = []
        for root, dirs, files in os.walk(self.tmpdir):
            # Skip hidden dirs and common noise
            dirs[:] = [d for d in dirs if not d.startswith(".")
                       and d not in ("node_modules", "__pycache__", ".git", "venv", ".tox")]
            depth = root.replace(self.tmpdir, "").count(os.sep)
            if depth <= 2:
                for f in files:
                    rel = os.path.relpath(os.path.join(root, f), self.tmpdir)
                    result.append(rel)
            if len(result) > 100:
                break
        return result[:100]
|
| 399 |
+
```
|
| 400 |
+
|
| 401 |
+
---
|
| 402 |
+
|
| 403 |
+
## 5. Task Loader (`env/task_loader.py`)
|
| 404 |
+
|
| 405 |
+
```python
|
| 406 |
+
import pandas as pd
|
| 407 |
+
import random
|
| 408 |
+
from typing import Optional
|
| 409 |
+
|
| 410 |
+
class TaskLoader:
    """Loads dataset rows and expands them into per-task-type entries."""

    def __init__(self, csv_path: str):
        df = pd.read_csv(csv_path)
        # Expand the semicolon-separated task_types column into one entry
        # per (row, task_type) pair.
        rows = []
        for _, row in df.iterrows():
            for tt in str(row["task_types"]).split(";"):
                r = row.to_dict()
                r["task_type"] = tt.strip()
                rows.append(r)
        self.tasks = rows
        self._forced_type: Optional[str] = None

    def sample(self) -> dict:
        """Sample a random task, optionally filtered by the forced type.

        Raises:
            ValueError: when no task matches the forced type. (BUG FIX:
            random.choice on an empty pool previously raised a bare
            IndexError with no hint at the cause.)
        """
        pool = self.tasks
        if self._forced_type:
            pool = [t for t in self.tasks if t["task_type"] == self._forced_type]
        if not pool:
            raise ValueError(
                f"No tasks available for task_type={self._forced_type!r}"
            )
        task = random.choice(pool).copy()
        task["task_description"] = self._make_description(task)
        return task

    def force_task_type(self, task_type: str):
        """Force subsequent sample() calls to return a specific task type."""
        self._forced_type = task_type

    def _make_description(self, task: dict) -> str:
        # Instructions shown to the agent; each mentions the exact terminal
        # action name so the agent knows how to end the episode.
        tt = task["task_type"]
        if tt == "classify":
            return (
                "Investigate the given test and determine whether it is FLAKY or STABLE. "
                "Use read_file and search_code to gather evidence. "
                "When confident, call classify_flakiness with argument 'flaky' or 'stable'."
            )
        elif tt == "root_cause":
            return (
                "This test is confirmed flaky. Identify its root cause category. "
                "Valid categories: OD, OD-Brit, OD-Vic, NIO, NOD, TD, TZD, ID, NDOI. "
                "Use read_file and search_code to find evidence. "
                "Call classify_root_cause with the category code when confident."
            )
        elif tt == "fix_proposal":
            return (
                f"This test is confirmed flaky with root cause: {task['category']}. "
                f"Propose a concrete fix as a unified diff. "
                f"Use read_file and search_code to understand the code. "
                f"Call propose_fix with a valid unified diff string."
            )
        return "Investigate the flaky test."
|
| 459 |
+
```
|
| 460 |
+
|
| 461 |
+
---
|
| 462 |
+
|
| 463 |
+
## 6. Core Environment (`env/environment.py`)
|
| 464 |
+
|
| 465 |
+
```python
|
| 466 |
+
import random
|
| 467 |
+
from env.models import FlakySleuthObservation, FlakySleuthAction
|
| 468 |
+
from env.sandbox import Sandbox
|
| 469 |
+
from env.task_loader import TaskLoader
|
| 470 |
+
from graders import grade_action
|
| 471 |
+
|
| 472 |
+
# Substrings that correlate with known flakiness causes (timing,
# randomness, shared state, I/O). Used by _search_relevance_reward to give
# a slightly larger shaping reward when the agent greps for one of them.
FLAKY_SIGNAL_PATTERNS = [
    "sleep", "random", "time", "datetime", "thread", "asyncio",
    "fixture", "setUp", "tearDown", "global", "shared", "singleton",
    "os.environ", "socket", "timeout", "retry", "mock", "patch"
]
|
| 477 |
+
|
| 478 |
+
class FlakySleuthEnv:
    """Episodic environment for flaky-test investigation.

    Each episode: sample one dataset row, clone its repo into a Sandbox,
    let the agent explore via tool actions (earning small shaping rewards
    capped at 0.30 total), then grade the terminal verdict action.
    """

    def __init__(self, dataset_path: str = "dataset/py_tasks.csv"):
        self.loader = TaskLoader(dataset_path)
        self.sandbox: Sandbox = None           # built fresh in reset()
        self.current_task: dict = None         # sampled dataset row
        self.step_count: int = 0
        self.cumulative_progress: float = 0.0  # shaping reward, capped at 0.30
        self.files_read: set = set()           # paths already rewarded once
        self.episode_actions: list = []        # action history (debugging)

    def reset(self) -> FlakySleuthObservation:
        """Start a new episode; returns the initial observation."""
        # Cleanup previous episode
        if self.sandbox:
            self.sandbox.cleanup()

        # Sample new task
        self.current_task = self.loader.sample()
        self.sandbox = Sandbox(self.current_task)
        self.sandbox.setup()

        # Reset episode state
        self.step_count = 0
        self.cumulative_progress = 0.0
        self.files_read = set()
        self.episode_actions = []

        return self._make_obs()

    def step(self, action: FlakySleuthAction):
        """Apply one action. Returns (observation, reward, done, info)."""
        self.step_count += 1
        self.episode_actions.append(action)
        tool_output = None
        reward = 0.0
        done = False
        info = {}

        TERMINAL_ACTIONS = ("classify_flakiness", "classify_root_cause", "propose_fix")

        if action.action_type in TERMINAL_ACTIONS:
            # Grade terminal action
            terminal_score = grade_action(action, self.current_task)

            # Late step penalty: -0.05 per step beyond 15
            late_penalty = max(0, (self.step_count - 15)) * 0.05

            # Wrong-direction penalty for T1: calling a known-flaky test
            # "stable" is penalized beyond a merely invalid answer.
            wrong_dir_penalty = 0.0
            if (action.action_type == "classify_flakiness"
                and action.argument.lower() == "stable"
                and self.current_task.get("label") == "flaky"):
                wrong_dir_penalty = 0.2

            # Final reward = shaping progress + graded score - penalties,
            # clamped into [0, 1].
            reward = min(1.0, max(0.0,
                self.cumulative_progress + terminal_score
                - late_penalty - wrong_dir_penalty
            ))
            done = True
            info = {
                "terminal_score": terminal_score,
                "progress_score": self.cumulative_progress,
                "late_penalty": late_penalty,
                "task_type": self.current_task["task_type"],
                "category": self.current_task["category"],
            }

        else:
            # Exploratory action: run the tool and accrue capped progress.
            tool_output, progress = self._execute_exploration(action)
            self.cumulative_progress = min(0.30, self.cumulative_progress + progress)
            reward = progress

        obs = self._make_obs(tool_output)
        return obs, reward, done, info

    def state(self) -> dict:
        """Debug/inspection snapshot (safe to call before any episode)."""
        return {
            "repo_url": self.current_task["repo_url"] if self.current_task else None,
            "test_name": self.current_task["test_name"] if self.current_task else None,
            "task_type": self.current_task["task_type"] if self.current_task else None,
            "step_count": self.step_count,
            "files_read": list(self.files_read),
            "cumulative_progress": self.cumulative_progress,
        }

    def _execute_exploration(self, action: FlakySleuthAction):
        """Run a non-terminal tool action; returns (tool_output, progress)."""
        progress = 0.0
        output = ""

        if action.action_type == "read_file":
            content = self.sandbox.read_file(action.argument)
            if content is None:
                output = f"ERROR: File not found: {action.argument}"
                progress = -0.05  # hallucination penalty
            elif action.argument in self.files_read:
                output = content
                progress = 0.0  # no reward for re-read
            else:
                self.files_read.add(action.argument)
                output = content
                progress = self._file_relevance_reward(action.argument)

        elif action.action_type == "search_code":
            output = self.sandbox.grep(action.argument)
            progress = self._search_relevance_reward(action.argument)

        elif action.action_type == "run_test":
            output = self.sandbox.run_test(self.current_task["test_name"])
            # Reward for actually running the test (shows initiative)
            # But 0 if OD task (sandbox returns static message)
            if self.current_task["category"] not in ("OD", "OD-Brit", "OD-Vic"):
                progress = 0.05

        return output, progress

    def _file_relevance_reward(self, filepath: str) -> float:
        """Shaping reward for a first-time file read, graded by relevance."""
        task = self.current_task
        test_file = task.get("test_file", "")

        if test_file and test_file in filepath:
            return 0.07  # reading the actual test file
        if any(filepath.endswith(ext) for ext in (".py",)):
            return 0.03  # any python file
        return 0.01  # non-python file (requirements, config, etc.)

    def _search_relevance_reward(self, pattern: str) -> float:
        """Shaping reward for a search; higher for known flakiness signals."""
        pattern_lower = pattern.lower()
        if any(sig in pattern_lower for sig in FLAKY_SIGNAL_PATTERNS):
            return 0.04  # searching for known flakiness signals
        return 0.01  # generic search

    def _make_obs(self, tool_output=None) -> FlakySleuthObservation:
        """Assemble the observation from current task and sandbox state."""
        task = self.current_task
        return FlakySleuthObservation(
            repo_url=task["repo_url"],
            test_name=task["test_name"],
            test_code=task.get("test_code", "")[:2000],
            file_tree=self.sandbox.file_tree if self.sandbox else [],
            tool_output=tool_output,
            task_type=task["task_type"],
            task_description=task["task_description"],
            step_count=self.step_count,
        )
|
| 620 |
+
```
|
| 621 |
+
|
| 622 |
+
---
|
| 623 |
+
|
| 624 |
+
## 7. Graders
|
| 625 |
+
|
| 626 |
+
### 7.1 Dispatcher (`graders/__init__.py`)
|
| 627 |
+
|
| 628 |
+
```python
|
| 629 |
+
from env.models import FlakySleuthAction
|
| 630 |
+
from graders.task1_grader import grade as grade_t1
|
| 631 |
+
from graders.task2_grader import grade as grade_t2
|
| 632 |
+
from graders.task3_grader import grade as grade_t3
|
| 633 |
+
|
| 634 |
+
def grade_action(action: FlakySleuthAction, task: dict) -> float:
    """Dispatch a terminal action to the grader for the task's type.

    Unknown task types score 0.0.
    """
    graders_by_type = {
        "classify": grade_t1,
        "root_cause": grade_t2,
        "fix_proposal": grade_t3,
    }
    grader = graders_by_type.get(task["task_type"])
    return grader(action, task) if grader else 0.0
|
| 643 |
+
```
|
| 644 |
+
|
| 645 |
+
### 7.2 Task 1 Grader (`graders/task1_grader.py`)
|
| 646 |
+
|
| 647 |
+
```python
|
| 648 |
+
from env.models import FlakySleuthAction
|
| 649 |
+
|
| 650 |
+
def grade(action: FlakySleuthAction, task: dict) -> float:
    """Binary classification: flaky or stable. Exact match only."""
    if action.action_type != "classify_flakiness":
        return 0.0

    verdict = action.argument.strip().lower()
    if verdict not in ("flaky", "stable"):
        return 0.0

    # All IDoFT rows are flaky; stable examples are synthetically added
    # with label="stable" during dataset construction
    expected = task.get("label", "flaky")
    return 1.0 if verdict == expected else 0.0
|
| 663 |
+
```
|
| 664 |
+
|
| 665 |
+
### 7.3 Task 2 Grader (`graders/task2_grader.py`)
|
| 666 |
+
|
| 667 |
+
```python
|
| 668 |
+
import json
|
| 669 |
+
import os
|
| 670 |
+
from env.models import FlakySleuthAction
|
| 671 |
+
|
| 672 |
+
# Load similarity matrix once at module level.
# NOTE(review): this raises at import time if the dataset file is missing
# or malformed — presumably an intentional fail-fast; confirm before
# packaging the graders independently of the dataset directory.
_SIM_PATH = os.path.join(os.path.dirname(__file__),
                         "..", "dataset", "category_similarity.json")
with open(_SIM_PATH) as f:
    _RAW_SIM = json.load(f)
|
| 677 |
+
|
| 678 |
+
def _get_similarity(pred: str, true: str) -> float:
    """Return partial credit for predicting *pred* when truth is *true*.

    1.0 for an exact match; otherwise look the pair up in the similarity
    matrix under both key orders, defaulting to 0.0.
    """
    if pred == true:
        return 1.0
    for pair_key in (f"{pred},{true}", f"{true},{pred}"):
        if pair_key in _RAW_SIM:
            return _RAW_SIM[pair_key]
    return 0.0
|
| 684 |
+
|
| 685 |
+
# Full IDoFT category vocabulary accepted as agent output. Note this is
# deliberately wider than the categories sampled into tasks; any string
# outside this set scores 0.
VALID_CATEGORIES = {
    "OD", "OD-Brit", "OD-Vic", "NIO", "NOD",
    "UD", "TD", "TZD", "ID", "NDOI", "NDOD", "OSD"
}
|
| 689 |
+
|
| 690 |
+
def grade(action: FlakySleuthAction, task: dict) -> float:
    """
    Root cause category classification.
    Exact match = 1.0
    Related category = partial credit via similarity matrix
    Wrong family = 0.0
    """
    if action.action_type != "classify_root_cause":
        return 0.0

    # Canonical spellings keyed by uppercase, so matching is
    # case-insensitive. BUG FIX: the original compared .upper() output
    # ("OD-BRIT") directly against the mixed-case VALID_CATEGORIES and
    # similarity-matrix keys ("OD-Brit"), so OD-Brit/OD-Vic predictions
    # always scored 0.
    canonical = {c.upper(): c for c in VALID_CATEGORIES}

    # Handle common variations, e.g. "od brit" -> "OD-Brit"
    predicted = canonical.get(action.argument.strip().replace(" ", "-").upper())
    if predicted is None:
        return 0.0  # invalid category string

    # Take primary category from dataset (first if semicolon-separated),
    # normalized through the same canonical map so similarity lookups use
    # the matrix's mixed-case spellings.
    raw_true = str(task.get("category", "")).split(";")[0].strip()
    true_category = canonical.get(raw_true.upper(), raw_true)

    return _get_similarity(predicted, true_category)
|
| 712 |
+
```
|
| 713 |
+
|
| 714 |
+
### 7.4 Task 3 Grader (`graders/task3_grader.py`)
|
| 715 |
+
|
| 716 |
+
```python
|
| 717 |
+
import subprocess
|
| 718 |
+
import tempfile
|
| 719 |
+
import os
|
| 720 |
+
import json
|
| 721 |
+
from openai import OpenAI
|
| 722 |
+
from env.models import FlakySleuthAction
|
| 723 |
+
|
| 724 |
+
# Human-readable definitions of the gradeable categories, interpolated
# into the LLM-judge prompt so the judge knows what the fix must address.
CATEGORY_DESCRIPTIONS = {
    "TD": "Time-Dependent: test fails due to reliance on wall-clock time",
    "TZD": "Timezone-Dependent: test fails in different timezones",
    "NOD": "Non-Deterministic: test fails due to randomness or non-determinism",
    "NIO": "Non-Idempotent-Outcome: test passes first run but fails on second run",
    "ID": "Implementation-Dependent: test fails due to language/runtime non-determinism (e.g. dict ordering)",
}
|
| 731 |
+
|
| 732 |
+
# Substrings a plausible fix for each category is expected to contain.
# Used only as a cheap heuristic signal (Component A of the grader); the
# LLM judge carries the largest share of the grading weight.
EXPECTED_FIX_PATTERNS = {
    "TD": ["freeze_time", "mock", "patch", "utcnow", "datetime", "monkeypatch"],
    "TZD": ["timezone", "utc", "pytz", "zoneinfo", "tzinfo", "UTC"],
    "NOD": ["seed", "mock", "patch", "deterministic", "sorted"],
    "NIO": ["setUp", "tearDown", "fixture", "yield", "cleanup", "autouse"],
    "ID": ["sorted(", "list(", "frozenset", "OrderedDict"],
}
|
| 739 |
+
|
| 740 |
+
def grade(action: FlakySleuthAction, task: dict) -> float:
    """
    Fix proposal grader combining three signals:
      Component A (weight 0.35): category-specific fix-pattern heuristic
      Component B (weight 0.25): does the diff apply cleanly?
      Component C (weight 0.40): LLM judge vs the known accepted fix
    """
    if action.action_type != "propose_fix":
        return 0.0

    proposed_fix = action.argument.strip()
    if not proposed_fix:
        return 0.0

    category = str(task.get("category", "")).split(";")[0].strip().upper()
    known_fix = task.get("known_fix_diff", "") or ""
    test_code = task.get("test_code", "") or ""

    # Component A: heuristic pattern check
    expected = EXPECTED_FIX_PATTERNS.get(category, [])
    if expected:
        hit_count = sum(1 for marker in expected if marker in proposed_fix)
        pattern_score = min(1.0, hit_count / max(1, len(expected) * 0.4))
    else:
        pattern_score = 0.5  # no known patterns for this category: neutral

    # Component B: dry-run patch application
    apply_score = _check_diff_applies(proposed_fix, task)

    # Component C: LLM judge
    judge_score = _llm_judge(proposed_fix, known_fix, category, test_code)

    weighted = 0.35 * pattern_score + 0.25 * apply_score + 0.40 * judge_score
    return round(min(1.0, max(0.0, weighted)), 4)
|
| 774 |
+
|
| 775 |
+
|
| 776 |
+
def _check_diff_applies(fix: str, task: dict) -> float:
|
| 777 |
+
"""Try a dry-run patch application against the test file in a temp copy."""
|
| 778 |
+
try:
|
| 779 |
+
test_file = task.get("test_file", "")
|
| 780 |
+
sandbox_path = task.get("sandbox_test_path", "")
|
| 781 |
+
|
| 782 |
+
if not sandbox_path or not os.path.exists(sandbox_path):
|
| 783 |
+
return 0.3 # can't verify, neutral-ish
|
| 784 |
+
|
| 785 |
+
with tempfile.NamedTemporaryFile(mode="w", suffix=".patch", delete=False) as f:
|
| 786 |
+
f.write(fix)
|
| 787 |
+
patch_path = f.name
|
| 788 |
+
|
| 789 |
+
result = subprocess.run(
|
| 790 |
+
["patch", "--dry-run", "-p1", sandbox_path, patch_path],
|
| 791 |
+
capture_output=True, text=True, timeout=10
|
| 792 |
+
)
|
| 793 |
+
os.unlink(patch_path)
|
| 794 |
+
return 1.0 if result.returncode == 0 else 0.0
|
| 795 |
+
except Exception:
|
| 796 |
+
return 0.3 # can't verify, neutral
|
| 797 |
+
|
| 798 |
+
|
| 799 |
+
def _llm_judge(proposed: str, known: str, category: str, test_code: str) -> float:
    """Call the LLM judge via OpenAI-compatible API.

    Returns the judge's 0-10 score scaled to [0.0, 1.0]; any failure
    (network, bad JSON, missing key) falls back to a neutral 0.5.
    """
    # Endpoint and model are configurable via environment variables so the
    # same grader works against OpenAI or any compatible proxy.
    client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY", ""),
        base_url=os.environ.get("API_BASE_URL", "https://api.openai.com/v1"),
    )
    model = os.environ.get("MODEL_NAME", "gpt-4o-mini")

    cat_desc = CATEGORY_DESCRIPTIONS.get(category, f"Flakiness category: {category}")
    known_section = f"Known accepted fix (from merged PR):\n```\n{known[:800]}\n```" if known else "Known fix: Not available"

    prompt = f"""You are evaluating a proposed fix for a flaky Python test.

Flakiness category: {category}
What this means: {cat_desc}

Original flaky test code:
```python
{test_code[:1000]}
```

Proposed fix (unified diff):
```
{proposed[:1000]}
```

{known_section}

Score the proposed fix from 0 to 10:
- 0–2: Fix is wrong, irrelevant, or makes things worse
- 3–5: Fix partially addresses the issue but misses root cause
- 6–8: Fix correctly addresses root cause with minor issues
- 9–10: Fix is correct, clean, minimal, and addresses root cause completely

Respond ONLY with a JSON object and nothing else:
{{"score": <integer 0-10>, "reason": "<one sentence explanation>"}}"""

    try:
        # temperature=0.0 keeps the judge as deterministic as the API allows.
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=100,
            temperature=0.0,
        )
        raw = resp.choices[0].message.content.strip()
        # Strip markdown fences if present
        raw = raw.replace("```json", "").replace("```", "").strip()
        data = json.loads(raw)
        score = int(data["score"])
        # Clamp to the 0-10 rubric, then scale into [0, 1].
        return max(0.0, min(10.0, score)) / 10.0
    except Exception:
        # NOTE(review): all failures silently score 0.5 — presumably an
        # intentional neutral fallback so grading never crashes; confirm.
        return 0.5  # fallback neutral on any failure
|
| 851 |
+
```
|
| 852 |
+
|
| 853 |
+
---
|
| 854 |
+
|
| 855 |
+
## 8. OpenEnv HTTP Server (`server.py`)
|
| 856 |
+
|
| 857 |
+
```python
|
| 858 |
+
from fastapi import FastAPI, HTTPException
|
| 859 |
+
from env.models import FlakySleuthObservation, FlakySleuthAction
|
| 860 |
+
from env.environment import FlakySleuthEnv
|
| 861 |
+
|
| 862 |
+
app = FastAPI(title="FlakySleuth Environment")
# Single shared environment instance: this server serves ONE episode
# stream at a time (no per-session state).
env = FlakySleuthEnv()

@app.post("/reset")
def reset() -> FlakySleuthObservation:
    """Start a new episode and return the first observation."""
    return env.reset()

@app.post("/step")
def step(action: FlakySleuthAction):
    """Apply one agent action; returns observation/reward/done/info."""
    obs, reward, done, info = env.step(action)
    return {
        # NOTE(review): .dict() is the pydantic v1 API (deprecated in v2,
        # where it is model_dump()) — confirm which pydantic is pinned.
        "observation": obs.dict(),
        "reward": reward,
        "done": done,
        "info": info,
    }

@app.get("/state")
def state():
    """Debug snapshot of the current episode."""
    return env.state()

@app.get("/health")
def health():
    """Liveness probe."""
    return {"status": "ok"}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
| 890 |
+
```
|
| 891 |
+
|
| 892 |
+
---
|
| 893 |
+
|
| 894 |
+
## 9. `openenv.yaml`
|
| 895 |
+
|
| 896 |
+
```yaml
|
| 897 |
+
name: flaky-sleuth-env
|
| 898 |
+
version: 0.1.0
|
| 899 |
+
description: >
|
| 900 |
+
An RL environment where an LLM agent investigates flaky tests in real
|
| 901 |
+
Python GitHub repositories. The agent uses tool calls to read code,
|
| 902 |
+
search for patterns, and run tests — then produces a verdict (classify,
|
| 903 |
+
root cause, or fix). Tasks range from binary flakiness classification
|
| 904 |
+
to proposing concrete code fixes verified by a hybrid grader.
|
| 905 |
+
|
| 906 |
+
observation_type: FlakySleuthObservation
|
| 907 |
+
action_type: FlakySleuthAction
|
| 908 |
+
reward_range: [0.0, 1.0]
|
| 909 |
+
|
| 910 |
+
tasks:
|
| 911 |
+
- id: task1_classify
|
| 912 |
+
name: "Flaky vs. Stable Classification"
|
| 913 |
+
difficulty: easy
|
| 914 |
+
description: >
|
| 915 |
+
Given a test from a real Python repo, classify it as flaky or stable.
|
| 916 |
+
Agent must call classify_flakiness with argument 'flaky' or 'stable'.
|
| 917 |
+
|
| 918 |
+
- id: task2_root_cause
|
| 919 |
+
name: "Root Cause Category Identification"
|
| 920 |
+
difficulty: medium
|
| 921 |
+
description: >
|
| 922 |
+
Given a confirmed flaky test, identify the root cause category
|
| 923 |
+
(OD, NOD, TD, TZD, NIO, ID, etc.) via static code analysis.
|
| 924 |
+
|
| 925 |
+
- id: task3_fix_proposal
|
| 926 |
+
name: "Fix Proposal"
|
| 927 |
+
difficulty: hard
|
| 928 |
+
description: >
|
| 929 |
+
Given a confirmed flaky test and its root cause, propose a concrete
|
| 930 |
+
fix as a unified diff. Evaluated by pattern matching + LLM judge.
|
| 931 |
+
|
| 932 |
+
episode_max_steps: 20
|
| 933 |
+
baseline_script: inference.py
|
| 934 |
+
|
| 935 |
+
infra:
|
| 936 |
+
vcpu: 2
|
| 937 |
+
memory_gb: 8
|
| 938 |
+
max_inference_minutes: 20
|
| 939 |
+
```
|
| 940 |
+
|
| 941 |
+
---
|
| 942 |
+
|
| 943 |
+
## 10. Baseline Inference Script (`inference.py`)
|
| 944 |
+
|
| 945 |
+
**CRITICAL:** Must be named exactly `inference.py` in the root directory. Must use OpenAI client. Must read `API_BASE_URL`, `MODEL_NAME`, `OPENAI_API_KEY` from environment variables.
|
| 946 |
+
|
| 947 |
+
```python
|
| 948 |
+
"""
|
| 949 |
+
FlakySleuth baseline inference script.
|
| 950 |
+
|
| 951 |
+
Required environment variables:
|
| 952 |
+
OPENAI_API_KEY — API key
|
| 953 |
+
API_BASE_URL — LLM endpoint (default: https://api.openai.com/v1)
|
| 954 |
+
MODEL_NAME — Model identifier (default: gpt-4o-mini)
|
| 955 |
+
|
| 956 |
+
Runs 5 episodes × 3 task types = 15 total episodes.
|
| 957 |
+
Prints average score per task type.
|
| 958 |
+
Must complete in under 20 minutes on vcpu=2, 8GB RAM.
|
| 959 |
+
"""
|
| 960 |
+
|
| 961 |
+
import os
|
| 962 |
+
import json
|
| 963 |
+
from openai import OpenAI
|
| 964 |
+
from env.environment import FlakySleuthEnv
|
| 965 |
+
from env.models import FlakySleuthAction
|
| 966 |
+
|
| 967 |
+
# ── Configuration ──────────────────────────────────────────────────
|
| 968 |
+
API_KEY = os.environ.get("OPENAI_API_KEY", "")
|
| 969 |
+
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
|
| 970 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
|
| 971 |
+
EPISODES_PER_TASK = 5
|
| 972 |
+
|
| 973 |
+
client = OpenAI(api_key=API_KEY, base_url=API_BASE_URL)
|
| 974 |
+
|
| 975 |
+
# ── System prompt (teaches the model your tool interface) ──────────
|
| 976 |
+
SYSTEM_PROMPT = """You are a flaky test detective. You investigate Python tests in real GitHub repositories.
|
| 977 |
+
|
| 978 |
+
At each step, respond ONLY with a single valid JSON object — no explanation, no markdown, no extra text.
|
| 979 |
+
|
| 980 |
+
Available actions:
|
| 981 |
+
|
| 982 |
+
EXPLORATORY (use these to gather evidence):
|
| 983 |
+
{"action_type": "read_file", "argument": "relative/path/to/file.py"}
|
| 984 |
+
{"action_type": "search_code", "argument": "pattern_to_grep_for"}
|
| 985 |
+
{"action_type": "run_test", "argument": ""}
|
| 986 |
+
|
| 987 |
+
TERMINAL (use exactly one of these to end the episode):
|
| 988 |
+
{"action_type": "classify_flakiness", "argument": "flaky"}
|
| 989 |
+
{"action_type": "classify_flakiness", "argument": "stable"}
|
| 990 |
+
{"action_type": "classify_root_cause", "argument": "OD"}
|
| 991 |
+
{"action_type": "classify_root_cause", "argument": "NOD"}
|
| 992 |
+
{"action_type": "classify_root_cause", "argument": "TD"}
|
| 993 |
+
{"action_type": "classify_root_cause", "argument": "TZD"}
|
| 994 |
+
{"action_type": "classify_root_cause", "argument": "NIO"}
|
| 995 |
+
{"action_type": "classify_root_cause", "argument": "ID"}
|
| 996 |
+
{"action_type": "classify_root_cause", "argument": "OD-Brit"}
|
| 997 |
+
{"action_type": "classify_root_cause", "argument": "OD-Vic"}
|
| 998 |
+
{"action_type": "propose_fix", "argument": "--- a/path\\n+++ b/path\\n@@ ... @@\\n-old line\\n+new line"}
|
| 999 |
+
|
| 1000 |
+
RULES:
|
| 1001 |
+
1. Always read the test file first before making a terminal decision.
|
| 1002 |
+
2. Search for flakiness signals: sleep, random, time, datetime, thread, os.environ, shared state.
|
| 1003 |
+
3. For order-dependent (OD) tests, run_test is disabled — use static analysis only.
|
| 1004 |
+
4. Call a terminal action only when you have enough evidence.
|
| 1005 |
+
5. Respond with ONLY valid JSON. Nothing else."""
|
| 1006 |
+
|
| 1007 |
+
|
| 1008 |
+
def obs_to_prompt(obs) -> str:
|
| 1009 |
+
return f"""TASK: {obs.task_description}
|
| 1010 |
+
|
| 1011 |
+
Repository: {obs.repo_url}
|
| 1012 |
+
Test name: {obs.test_name}
|
| 1013 |
+
Step: {obs.step_count}/20
|
| 1014 |
+
|
| 1015 |
+
Test source code:
|
| 1016 |
+
```python
|
| 1017 |
+
{obs.test_code}
|
| 1018 |
+
```
|
| 1019 |
+
|
| 1020 |
+
Repository file tree (top-level):
|
| 1021 |
+
{chr(10).join(obs.file_tree[:40])}
|
| 1022 |
+
|
| 1023 |
+
Result of your last action:
|
| 1024 |
+
{obs.tool_output or "(No action taken yet — this is the start of the episode)"}
|
| 1025 |
+
|
| 1026 |
+
What is your next action? Respond with JSON only."""
|
| 1027 |
+
|
| 1028 |
+
|
| 1029 |
+
def run_episode(env: FlakySleuthEnv) -> float:
|
| 1030 |
+
obs = env.reset()
|
| 1031 |
+
messages = [
|
| 1032 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 1033 |
+
{"role": "user", "content": obs_to_prompt(obs)},
|
| 1034 |
+
]
|
| 1035 |
+
total_reward = 0.0
|
| 1036 |
+
|
| 1037 |
+
for step in range(20):
|
| 1038 |
+
try:
|
| 1039 |
+
resp = client.chat.completions.create(
|
| 1040 |
+
model=MODEL_NAME,
|
| 1041 |
+
messages=messages,
|
| 1042 |
+
max_tokens=400,
|
| 1043 |
+
temperature=0.0,
|
| 1044 |
+
)
|
| 1045 |
+
raw = resp.choices[0].message.content.strip()
|
| 1046 |
+
messages.append({"role": "assistant", "content": raw})
|
| 1047 |
+
|
| 1048 |
+
# Parse action
|
| 1049 |
+
clean = raw.replace("```json", "").replace("```", "").strip()
|
| 1050 |
+
action_dict = json.loads(clean)
|
| 1051 |
+
action = FlakySleuthAction(**action_dict)
|
| 1052 |
+
|
| 1053 |
+
except json.JSONDecodeError:
|
| 1054 |
+
# Model produced non-JSON — inject correction message
|
| 1055 |
+
messages.append({
|
| 1056 |
+
"role": "user",
|
| 1057 |
+
"content": "ERROR: Your response was not valid JSON. "
|
| 1058 |
+
"Respond ONLY with a JSON object as specified."
|
| 1059 |
+
})
|
| 1060 |
+
continue
|
| 1061 |
+
except Exception as e:
|
| 1062 |
+
print(f" Step {step} error: {e}")
|
| 1063 |
+
break
|
| 1064 |
+
|
| 1065 |
+
obs, reward, done, info = env.step(action)
|
| 1066 |
+
total_reward += reward
|
| 1067 |
+
|
| 1068 |
+
if done:
|
| 1069 |
+
print(f" Terminal: {action.action_type}({action.argument[:50]}) "
|
| 1070 |
+
f"→ terminal={info.get('terminal_score', 0):.2f} "
|
| 1071 |
+
f"progress={info.get('progress_score', 0):.2f} "
|
| 1072 |
+
f"total={total_reward:.2f}")
|
| 1073 |
+
break
|
| 1074 |
+
|
| 1075 |
+
messages.append({"role": "user", "content": obs_to_prompt(obs)})
|
| 1076 |
+
|
| 1077 |
+
return total_reward
|
| 1078 |
+
|
| 1079 |
+
|
| 1080 |
+
def main():
|
| 1081 |
+
env = FlakySleuthEnv()
|
| 1082 |
+
results = {"classify": [], "root_cause": [], "fix_proposal": []}
|
| 1083 |
+
|
| 1084 |
+
for task_type in results.keys():
|
| 1085 |
+
print(f"\n── Task type: {task_type} ──")
|
| 1086 |
+
env.loader.force_task_type(task_type)
|
| 1087 |
+
for ep in range(EPISODES_PER_TASK):
|
| 1088 |
+
score = run_episode(env)
|
| 1089 |
+
results[task_type].append(score)
|
| 1090 |
+
print(f" Episode {ep+1}: {score:.3f}")
|
| 1091 |
+
|
| 1092 |
+
print("\n══ BASELINE RESULTS ══")
|
| 1093 |
+
for task_type, scores in results.items():
|
| 1094 |
+
avg = sum(scores) / len(scores)
|
| 1095 |
+
print(f" {task_type:15s}: avg={avg:.3f} scores={[round(s,3) for s in scores]}")
|
| 1096 |
+
|
| 1097 |
+
overall = sum(s for scores in results.values() for s in scores)
|
| 1098 |
+
overall /= sum(len(v) for v in results.values())
|
| 1099 |
+
print(f" {'OVERALL':15s}: avg={overall:.3f}")
|
| 1100 |
+
|
| 1101 |
+
|
| 1102 |
+
if __name__ == "__main__":
|
| 1103 |
+
main()
|
| 1104 |
+
```
|
| 1105 |
+
|
| 1106 |
+
---
|
| 1107 |
+
|
| 1108 |
+
## 11. Dockerfile
|
| 1109 |
+
|
| 1110 |
+
```dockerfile
|
| 1111 |
+
FROM python:3.11-slim
|
| 1112 |
+
|
| 1113 |
+
# Install git and patch (needed for sandbox)
|
| 1114 |
+
RUN apt-get update && apt-get install -y \
|
| 1115 |
+
git \
|
| 1116 |
+
patch \
|
| 1117 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 1118 |
+
|
| 1119 |
+
WORKDIR /app
|
| 1120 |
+
|
| 1121 |
+
# Copy requirements first for layer caching
|
| 1122 |
+
COPY requirements.txt .
|
| 1123 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 1124 |
+
|
| 1125 |
+
# Copy everything else
|
| 1126 |
+
COPY . .
|
| 1127 |
+
|
| 1128 |
+
# Expose port for HF Spaces
|
| 1129 |
+
EXPOSE 7860
|
| 1130 |
+
|
| 1131 |
+
# Start FastAPI server
|
| 1132 |
+
CMD ["python", "server.py"]
|
| 1133 |
+
```
|
| 1134 |
+
|
| 1135 |
+
---
|
| 1136 |
+
|
| 1137 |
+
## 12. `requirements.txt`
|
| 1138 |
+
|
| 1139 |
+
```
|
| 1140 |
+
fastapi>=0.110.0
|
| 1141 |
+
uvicorn>=0.27.0
|
| 1142 |
+
pydantic>=2.0.0
|
| 1143 |
+
openai>=1.0.0
|
| 1144 |
+
pandas>=2.0.0
|
| 1145 |
+
gitpython>=3.1.0
|
| 1146 |
+
pytest>=7.0.0
|
| 1147 |
+
pytest-timeout>=2.0.0
|
| 1148 |
+
requests>=2.31.0
|
| 1149 |
+
```
|
| 1150 |
+
|
| 1151 |
+
---
|
| 1152 |
+
|
| 1153 |
+
## 13. Build Order (Day-by-Day Sprint)
|
| 1154 |
+
|
| 1155 |
+
```
|
| 1156 |
+
DAY 1 — Data Foundation
|
| 1157 |
+
────────────────────────
|
| 1158 |
+
□ Clone idoft repo, inspect py-data.csv manually
|
| 1159 |
+
□ Run build_dataset.py offline (set GITHUB_TOKEN)
|
| 1160 |
+
□ Verify py_tasks.csv has rows for all 3 task types
|
| 1161 |
+
□ Manually inspect 5-10 rows to sanity check test_code and known_fix_diff
|
| 1162 |
+
□ Build category_similarity.json
|
| 1163 |
+
|
| 1164 |
+
DAY 2 — Core Environment
|
| 1165 |
+
──────────────────────────
|
| 1166 |
+
□ Implement env/models.py (Pydantic models)
|
| 1167 |
+
□ Implement env/sandbox.py (clone, read_file, grep, run_test)
|
| 1168 |
+
□ Test sandbox.py manually on 2-3 real repos
|
| 1169 |
+
□ Implement env/task_loader.py
|
| 1170 |
+
□ Implement env/environment.py (reset, step, state)
|
| 1171 |
+
□ Write a quick smoke test: reset() → 3 steps → terminal action
|
| 1172 |
+
|
| 1173 |
+
DAY 3 — Graders
|
| 1174 |
+
────────────────
|
| 1175 |
+
□ Implement graders/task1_grader.py
|
| 1176 |
+
□ Implement graders/task2_grader.py + verify similarity matrix
|
| 1177 |
+
□ Implement graders/task3_grader.py (pattern + diff + LLM judge)
|
| 1178 |
+
□ Unit test all 3 graders with hardcoded inputs
|
| 1179 |
+
□ Verify scores are always in [0.0, 1.0]
|
| 1180 |
+
|
| 1181 |
+
DAY 4 — Server + Spec Compliance
|
| 1182 |
+
──────────────────────────────────
|
| 1183 |
+
□ Implement server.py (FastAPI: /reset, /step, /state, /health)
|
| 1184 |
+
□ Write openenv.yaml
|
| 1185 |
+
□ Run openenv validate — fix any errors
|
| 1186 |
+
□ Build Dockerfile locally: docker build . && docker run -p 7860:7860
|
| 1187 |
+
□ Test endpoints with curl
|
| 1188 |
+
|
| 1189 |
+
DAY 5 — Inference Script + Deploy
|
| 1190 |
+
────────────────────────────────────
|
| 1191 |
+
□ Implement inference.py (ReAct loop, OpenAI client)
|
| 1192 |
+
□ Run inference.py locally against real API
|
| 1193 |
+
□ Verify it completes in <20 min, produces scores for all 3 task types
|
| 1194 |
+
□ Deploy to Hugging Face Spaces
|
| 1195 |
+
□ Verify HF Space returns 200 on health check and responds to reset()
|
| 1196 |
+
□ Run pre-submission validation script
|
| 1197 |
+
|
| 1198 |
+
DAY 6 — Polish + Submit
|
| 1199 |
+
─────────────────────────
|
| 1200 |
+
□ Write README (env description, observation/action spaces, setup)
|
| 1201 |
+
□ Run full baseline one more time, record scores
|
| 1202 |
+
□ Submit HF Space URL before April 8 11:59 PM IST
|
| 1203 |
+
```
|
| 1204 |
+
|
| 1205 |
+
---
|
| 1206 |
+
|
| 1207 |
+
## 14. Pre-Submission Checklist (from Official Spec)
|
| 1208 |
+
|
| 1209 |
+
```
|
| 1210 |
+
□ HF Space deploys and returns 200 on automated ping
|
| 1211 |
+
□ reset() responds correctly
|
| 1212 |
+
□ openenv validate passes (openenv.yaml + typed models + step/reset/state)
|
| 1213 |
+
□ docker build succeeds on submitted repo
|
| 1214 |
+
□ inference.py runs without error and produces scores
|
| 1215 |
+
□ 3 tasks with graders, all scores in 0.0–1.0
|
| 1216 |
+
□ API_BASE_URL, MODEL_NAME, OPENAI_API_KEY (or OPENROUTER_API_KEY) env vars defined
|
| 1217 |
+
□ Inference script is named exactly inference.py in root directory
|
| 1218 |
+
□ All LLM calls use OpenAI client with those env vars
|
| 1219 |
+
□ Runtime < 20 min on vcpu=2, 8GB RAM
|
| 1220 |
+
```
|
| 1221 |
+
|
| 1222 |
+
---
|
| 1223 |
+
|
| 1224 |
+
## 15. Key Design Decisions Summary (for context)
|
| 1225 |
+
|
| 1226 |
+
| Decision | Choice | Reason |
|
| 1227 |
+
|---|---|---|
|
| 1228 |
+
| Language | Python only | Fast sandboxing, clean IDoFT data, no JVM overhead |
|
| 1229 |
+
| Dataset | IDoFT py-data.csv + category codes | Real repos, ground truth categories, PR-linked fixes |
|
| 1230 |
+
| OD tests in T3 | Excluded | Cannot verify fix without multi-order test execution |
|
| 1231 |
+
| OD tests in T1/T2 | Included | Static code analysis is a valid proxy |
|
| 1232 |
+
| T2 grader | Similarity matrix | Some wrong answers are more wrong than others |
|
| 1233 |
+
| T3 grader | Hybrid (pattern + diff + LLM judge) | Pure string match unfair; pure LLM judge non-deterministic |
|
| 1234 |
+
| Reward shaping | Step-level progress rewards | Prevents sparse reward, rewards good investigative behavior |
|
| 1235 |
+
| Max steps | 20 | Balances exploration depth vs infra time constraints |
|
| 1236 |
+
| Progress reward cap | 0.30 | Terminal score (0.70 max) dominates; exploration is supporting signal |
|
graders/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from env.models import FlakySleuthAction
|
| 4 |
+
from graders.task1_grader import grade as grade_t1
|
| 5 |
+
from graders.task2_grader import grade as grade_t2
|
| 6 |
+
from graders.task3_grader import grade as grade_t3
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def grade_action(action: FlakySleuthAction, task: dict) -> float:
    """Route *action* to the grader matching the task's type.

    Unknown or missing task types score 0.0.
    """
    graders_by_type = {
        "classify": grade_t1,
        "root_cause": grade_t2,
        "fix_proposal": grade_t3,
    }
    grader = graders_by_type.get(task.get("task_type"))
    if grader is None:
        return 0.0
    return grader(action, task)
|
graders/task1_grader.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from env.models import FlakySleuthAction
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def grade(action: FlakySleuthAction, task: dict) -> float:
    """Grade Task 1 (binary flaky/stable classification). Exact match only.

    Returns 1.0 when the agent's ``classify_flakiness`` verdict matches the
    task's ground-truth label, otherwise 0.0.  Any non-terminal action type,
    or an argument outside {"flaky", "stable"}, scores 0.0.
    """
    if action.action_type != "classify_flakiness":
        return 0.0

    predicted = action.argument.strip().lower()
    if predicted not in ("flaky", "stable"):
        return 0.0

    # Fix: apply the "flaky" fallback BEFORE str().  The previous form
    # str(task.get("label", "flaky")) turned a literal None label into the
    # string "none", which bypassed the `or "flaky"` fallback and could
    # never match a valid prediction.
    ground_truth = str(task.get("label") or "flaky").strip().lower() or "flaky"
    return 1.0 if predicted == ground_truth else 0.0
|
graders/task2_grader.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from env.models import FlakySleuthAction
|
| 7 |
+
|
| 8 |
+
# Pairwise category-similarity matrix shipped with the dataset.  Keys are
# "CAT_A,CAT_B" strings; values are partial-credit scores in [0, 1].
_SIM_PATH = Path(__file__).resolve().parent.parent / "dataset" / "category_similarity.json"
with _SIM_PATH.open("r", encoding="utf-8") as handle:
    _RAW_SIM = json.load(handle)

# Canonical spelling for every accepted flaky-test category, keyed by the
# upper-cased, dash-normalized form a free-text prediction reduces to.
_CANONICAL = {
    "OD": "OD",
    "OD-BRIT": "OD-Brit",
    "OD-VIC": "OD-Vic",
    "NIO": "NIO",
    "NOD": "NOD",
    "UD": "UD",
    "TD": "TD",
    "TZD": "TZD",
    "ID": "ID",
    "NDOI": "NDOI",
    "NDOD": "NDOD",
    "OSD": "OSD",
}


# The closed set of category names a normalized prediction must land in.
VALID_CATEGORIES = set(_CANONICAL.values())
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _normalize_category(value: str) -> str:
    """Map a free-form category string onto its canonical spelling.

    Underscores and spaces are treated as dashes and case is ignored.
    Returns "" when the input matches no known category.
    """
    canonical_key = value.strip().replace("_", "-").replace(" ", "-").upper()
    return _CANONICAL.get(canonical_key, "")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _get_similarity(predicted: str, truth: str) -> float:
    """Partial-credit similarity between two canonical categories.

    Exact matches score 1.0; otherwise the similarity matrix is consulted
    under both key orderings, defaulting to 0.0 when the pair is absent.
    """
    if predicted == truth:
        return 1.0
    for key in (f"{predicted},{truth}", f"{truth},{predicted}"):
        if key in _RAW_SIM:
            return float(_RAW_SIM[key])
    return 0.0
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def grade(action: FlakySleuthAction, task: dict) -> float:
    """Root-cause category grading with matrix-based partial credit.

    Scores 0.0 unless the agent issued ``classify_root_cause`` and both
    the prediction and the task's ground-truth category normalize into the
    known category set; otherwise returns the similarity between them
    (1.0 for an exact match).
    """
    if action.action_type != "classify_root_cause":
        return 0.0

    prediction = _normalize_category(action.argument)
    if prediction not in VALID_CATEGORIES:
        return 0.0

    # Ground truth may carry multiple ';'-separated categories — the first
    # entry is the primary one, per the dataset convention.
    primary_truth = _normalize_category(str(task.get("category", "")).split(";")[0])
    if primary_truth not in VALID_CATEGORIES:
        return 0.0

    return _get_similarity(prediction, primary_truth)
|
graders/task3_grader.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import subprocess
|
| 6 |
+
import tempfile
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
from openai import OpenAI
|
| 10 |
+
|
| 11 |
+
from env.models import FlakySleuthAction
|
| 12 |
+
|
| 13 |
+
# Human-readable explanation of each flakiness category; handed verbatim to
# the LLM judge so it can evaluate a proposed fix in context.
CATEGORY_DESCRIPTIONS = {
    "TD": "Time-Dependent: fails due to wall-clock time assumptions",
    "TZD": "Timezone-Dependent: fails across timezone settings",
    "NOD": "Non-Deterministic: fails due to randomness/non-determinism",
    "NIO": "Non-Idempotent-Outcome: passes first run, fails on repeated run",
    "ID": "Implementation-Dependent: fails due to runtime implementation details",
}

# Substrings we expect in a plausible fix for each category; the cheap
# pattern-matching component of the hybrid score counts how many appear.
EXPECTED_FIX_PATTERNS = {
    "TD": ["freeze_time", "mock", "patch", "utcnow", "datetime", "monkeypatch"],
    "TZD": ["timezone", "utc", "pytz", "zoneinfo", "tzinfo", "UTC"],
    "NOD": ["seed", "mock", "patch", "deterministic", "sorted"],
    "NIO": ["setup", "teardown", "fixture", "yield", "cleanup", "autouse"],
    "ID": ["sorted(", "list(", "frozenset", "OrderedDict"],
}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def grade(action: FlakySleuthAction, task: dict) -> float:
    """Hybrid fix-proposal grader: pattern match + dry-run patch + LLM judge.

    Weighted blend: 35% category-specific pattern hits, 25% whether the
    diff applies cleanly to the sandbox checkout, 40% LLM judge verdict.
    The result is rounded to 4 decimals and clamped into [0.0, 1.0].
    """
    if action.action_type != "propose_fix":
        return 0.0

    diff_text = action.argument.strip()
    if not diff_text:
        return 0.0

    category = str(task.get("category", "")).split(";")[0].strip().upper()
    known_fix = task.get("known_fix_diff", "") or ""
    test_code = task.get("test_code", "") or ""

    expected = EXPECTED_FIX_PATTERNS.get(category, [])
    if not expected:
        # Unknown category → neutral pattern credit.
        pattern_score = 0.5
    else:
        lowered = diff_text.lower()
        hits = sum(1 for token in expected if token.lower() in lowered)
        # Full credit once ~40% of the expected tokens show up.
        pattern_score = min(1.0, hits / max(1, len(expected) * 0.4))

    apply_score = _check_diff_applies(diff_text, task)
    judge_score = _llm_judge(diff_text, known_fix, category, test_code)

    combined = (0.35 * pattern_score) + (0.25 * apply_score) + (0.40 * judge_score)
    return round(min(1.0, max(0.0, combined)), 4)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _check_diff_applies(diff_text: str, task: dict) -> float:
|
| 60 |
+
if "+++" not in diff_text or "---" not in diff_text:
|
| 61 |
+
return 0.0
|
| 62 |
+
|
| 63 |
+
repo_root = str(task.get("sandbox_root", "")).strip()
|
| 64 |
+
if not repo_root or not Path(repo_root).exists():
|
| 65 |
+
return 0.3
|
| 66 |
+
|
| 67 |
+
patch_path = None
|
| 68 |
+
try:
|
| 69 |
+
with tempfile.NamedTemporaryFile(
|
| 70 |
+
mode="w", suffix=".patch", delete=False
|
| 71 |
+
) as handle:
|
| 72 |
+
handle.write(diff_text)
|
| 73 |
+
patch_path = handle.name
|
| 74 |
+
|
| 75 |
+
result = subprocess.run(
|
| 76 |
+
["patch", "--dry-run", "-p1", "-i", patch_path],
|
| 77 |
+
cwd=repo_root,
|
| 78 |
+
capture_output=True,
|
| 79 |
+
text=True,
|
| 80 |
+
timeout=10,
|
| 81 |
+
)
|
| 82 |
+
return 1.0 if result.returncode == 0 else 0.0
|
| 83 |
+
except Exception:
|
| 84 |
+
return 0.3
|
| 85 |
+
finally:
|
| 86 |
+
if patch_path and os.path.exists(patch_path):
|
| 87 |
+
os.unlink(patch_path)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _llm_judge(proposed: str, known: str, category: str, test_code: str) -> float:
    """Ask an LLM to score the proposed fix; returns a value in [0, 1].

    Falls back to a neutral 0.5 when no API key is configured or when the
    call/parse fails for any reason, so grading never hard-crashes.
    """
    # Key resolution: explicit API_KEY wins, then OpenRouter, then OpenAI.
    openrouter_key = os.environ.get("OPENROUTER_API_KEY")
    openai_key = os.environ.get("OPENAI_API_KEY")
    raw_api_key = os.environ.get("API_KEY")
    api_key = (raw_api_key or openrouter_key or openai_key or "").strip()
    if not api_key:
        return 0.5

    # Route to OpenRouter when it is the only key, or when API_KEY carries
    # the OpenRouter "sk-or-" prefix and no OpenAI key is present.
    using_openrouter = (openrouter_key and not raw_api_key and not openai_key) or (
        raw_api_key and raw_api_key.startswith("sk-or-") and not openai_key
    )

    default_base_url = (
        "https://openrouter.ai/api/v1"
        if using_openrouter
        else "https://api.openai.com/v1"
    )
    # An explicit API_BASE_URL always overrides the inferred default.
    api_base_url = os.environ.get("API_BASE_URL", default_base_url)
    client = OpenAI(api_key=api_key, base_url=api_base_url)
    # Pick a provider-appropriate default model; MODEL_NAME overrides.
    model = os.environ.get(
        "MODEL_NAME",
        "qwen/qwen3.6-plus:free"
        if api_base_url.startswith("https://openrouter.ai")
        else "gpt-4o-mini",
    )

    cat_desc = CATEGORY_DESCRIPTIONS.get(category, f"Flakiness category: {category}")
    # Show the judge the reference fix (truncated) when the dataset has one.
    if known:
        known_section = f"Known accepted fix (from merged PR):\n```\n{known[:800]}\n```"
    else:
        known_section = "Known fix: Not available"

    prompt = f"""You are evaluating a proposed fix for a flaky Python test.

Flakiness category: {category}
What this means: {cat_desc}

Original flaky test code:
```python
{test_code[:1000]}
```

Proposed fix (unified diff):
```
{proposed[:1000]}
```

{known_section}

Score the proposed fix from 0 to 10:
- 0-2: Fix is wrong, irrelevant, or harmful
- 3-5: Fix partially addresses the issue but misses root cause
- 6-8: Fix addresses root cause with minor issues
- 9-10: Fix is correct, minimal, and complete

Respond ONLY with JSON:
{{"score": <integer 0-10>, "reason": "<one sentence>"}}"""

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=120,
            temperature=0.0,
        )
        raw = (response.choices[0].message.content or "").strip()
        # Models often wrap JSON in markdown fences despite instructions.
        raw = raw.replace("```json", "").replace("```", "").strip()
        payload = json.loads(raw)
        score = int(payload.get("score", 5))
        # Clamp to the 0-10 rubric, then normalize to [0, 1].
        return max(0.0, min(10.0, score)) / 10.0
    except Exception:
        # Any network/parse failure degrades to a neutral verdict.
        return 0.5
|
inference.py
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FlakySleuth compliance inference script.
|
| 2 |
+
"""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import json
|
| 8 |
+
import os
|
| 9 |
+
from typing import Any
|
| 10 |
+
|
| 11 |
+
from openai import OpenAI
|
| 12 |
+
|
| 13 |
+
from env.environment import FlakySleuthEnv
|
| 14 |
+
from env.models import FlakySleuthAction, FlakySleuthObservation
|
| 15 |
+
|
| 16 |
+
# Credential discovery.  Precedence: an explicit API_KEY wins, then an HF
# token, then OpenRouter, then OpenAI.  Empty string means "no key found".
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
RAW_API_KEY = os.environ.get("API_KEY")
API_KEY = RAW_API_KEY or HF_TOKEN or OPENROUTER_API_KEY or OPENAI_API_KEY or ""
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _looks_like_openrouter_key(key: str | None) -> bool:
|
| 24 |
+
return bool(key and key.startswith("sk-or-"))
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Base-URL inference: HF router when only an HF token is present; OpenRouter
# when an OpenRouter key (or an "sk-or-"-prefixed API_KEY) is the best
# match; otherwise the OpenAI endpoint.  API_BASE_URL always overrides.
DEFAULT_BASE_URL = (
    "https://router.huggingface.co/v1"
    if (HF_TOKEN and not RAW_API_KEY and not OPENROUTER_API_KEY and not OPENAI_API_KEY)
    else (
        "https://openrouter.ai/api/v1"
        if (
            (OPENROUTER_API_KEY and not RAW_API_KEY and not OPENAI_API_KEY)
            or (_looks_like_openrouter_key(RAW_API_KEY) and not OPENAI_API_KEY)
        )
        else "https://api.openai.com/v1"
    )
)
API_BASE_URL = os.environ.get("API_BASE_URL", DEFAULT_BASE_URL)

# Provider-appropriate default model; MODEL_NAME always overrides.
DEFAULT_MODEL = (
    "openai/gpt-oss-120b:novita"
    if API_BASE_URL.startswith("https://router.huggingface.co")
    else ("qwen/qwen3.6-plus:free" if API_BASE_URL.startswith("https://openrouter.ai") else "gpt-4o-mini")
)
MODEL_NAME = os.environ.get("MODEL_NAME", DEFAULT_MODEL)

EPISODES_PER_TASK = 5  # episodes rolled out per task type
MAX_STEPS = 20  # hard cap on environment steps per episode
BENCHMARK_NAME = "flakysleuth"

client = OpenAI(api_key=API_KEY, base_url=API_BASE_URL)
|
| 53 |
+
|
| 54 |
+
SYSTEM_PROMPT = """You are a flaky test detective.
|
| 55 |
+
|
| 56 |
+
Respond ONLY with a single valid JSON object.
|
| 57 |
+
|
| 58 |
+
Exploration actions:
|
| 59 |
+
{"action_type": "read_file", "argument": "relative/path.py"}
|
| 60 |
+
{"action_type": "search_code", "argument": "pattern"}
|
| 61 |
+
{"action_type": "run_test", "argument": ""}
|
| 62 |
+
|
| 63 |
+
Terminal actions:
|
| 64 |
+
{"action_type": "classify_flakiness", "argument": "flaky"}
|
| 65 |
+
{"action_type": "classify_flakiness", "argument": "stable"}
|
| 66 |
+
{"action_type": "classify_root_cause", "argument": "OD"}
|
| 67 |
+
{"action_type": "classify_root_cause", "argument": "OD-Brit"}
|
| 68 |
+
{"action_type": "classify_root_cause", "argument": "OD-Vic"}
|
| 69 |
+
{"action_type": "classify_root_cause", "argument": "NIO"}
|
| 70 |
+
{"action_type": "classify_root_cause", "argument": "NOD"}
|
| 71 |
+
{"action_type": "classify_root_cause", "argument": "TD"}
|
| 72 |
+
{"action_type": "classify_root_cause", "argument": "TZD"}
|
| 73 |
+
{"action_type": "classify_root_cause", "argument": "ID"}
|
| 74 |
+
{"action_type": "propose_fix", "argument": "--- a/file.py\\n+++ b/file.py\\n@@ ... @@\\n-old\\n+new"}
|
| 75 |
+
|
| 76 |
+
Rules:
|
| 77 |
+
1. Read the test file first.
|
| 78 |
+
2. Search for flaky signals: random, time, sleep, shared state, env vars.
|
| 79 |
+
3. Run the test for non-order-dependent scenarios.
|
| 80 |
+
4. Call one terminal action when confident.
|
| 81 |
+
"""
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _single_line(text: str) -> str:
|
| 85 |
+
return " ".join(str(text).split())
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def log_start(task: str, env_name: str, model: str) -> None:
|
| 89 |
+
print(f"[START] task={task} env={env_name} model={model}", flush=True)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
|
| 93 |
+
error_value = _single_line(error) if error else "null"
|
| 94 |
+
done_value = str(bool(done)).lower()
|
| 95 |
+
print(
|
| 96 |
+
f"[STEP] step={step} action={_single_line(action)} "
|
| 97 |
+
f"reward={reward:.2f} done={done_value} error={error_value}",
|
| 98 |
+
flush=True,
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
|
| 103 |
+
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 104 |
+
print(
|
| 105 |
+
f"[END] success={str(bool(success)).lower()} steps={steps} "
|
| 106 |
+
f"score={score:.2f} rewards={rewards_str}",
|
| 107 |
+
flush=True,
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def obs_to_prompt(obs: FlakySleuthObservation, max_steps: int) -> str:
|
| 112 |
+
tree_preview = "\n".join(obs.file_tree[:40])
|
| 113 |
+
return f"""TASK: {obs.task_description}
|
| 114 |
+
|
| 115 |
+
Repository: {obs.repo_url}
|
| 116 |
+
Test name: {obs.test_name}
|
| 117 |
+
Step: {obs.step_count}/{max_steps}
|
| 118 |
+
|
| 119 |
+
Test source code:
|
| 120 |
+
```python
|
| 121 |
+
{obs.test_code}
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
Repository file tree:
|
| 125 |
+
{tree_preview}
|
| 126 |
+
|
| 127 |
+
Last tool output:
|
| 128 |
+
{obs.tool_output or '(No action taken yet)'}
|
| 129 |
+
|
| 130 |
+
Return only JSON action."""
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def heuristic_action(obs: FlakySleuthObservation) -> FlakySleuthAction:
|
| 134 |
+
if obs.step_count == 0 and obs.file_tree:
|
| 135 |
+
return FlakySleuthAction(action_type="read_file", argument=obs.file_tree[0])
|
| 136 |
+
|
| 137 |
+
if obs.step_count < 2:
|
| 138 |
+
return FlakySleuthAction(action_type="search_code", argument="random")
|
| 139 |
+
|
| 140 |
+
if obs.task_type == "classify":
|
| 141 |
+
return FlakySleuthAction(action_type="classify_flakiness", argument="flaky")
|
| 142 |
+
if obs.task_type == "root_cause":
|
| 143 |
+
return FlakySleuthAction(action_type="classify_root_cause", argument="NOD")
|
| 144 |
+
return FlakySleuthAction(
|
| 145 |
+
action_type="propose_fix",
|
| 146 |
+
argument=(
|
| 147 |
+
"--- a/src/math_utils.py\n"
|
| 148 |
+
"+++ b/src/math_utils.py\n"
|
| 149 |
+
"@@\n"
|
| 150 |
+
"-def unstable_sum(values):\n"
|
| 151 |
+
"- random.shuffle(values)\n"
|
| 152 |
+
"- return values[0] + values[1]\n"
|
| 153 |
+
"+def unstable_sum(values):\n"
|
| 154 |
+
"+ ordered = sorted(values)\n"
|
| 155 |
+
"+ return ordered[0] + ordered[1]\n"
|
| 156 |
+
),
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def llm_action(messages: list[dict[str, str]]) -> FlakySleuthAction | None:
|
| 161 |
+
if not API_KEY:
|
| 162 |
+
return None
|
| 163 |
+
|
| 164 |
+
response = client.chat.completions.create(
|
| 165 |
+
model=MODEL_NAME,
|
| 166 |
+
messages=messages,
|
| 167 |
+
max_tokens=400,
|
| 168 |
+
temperature=0.0,
|
| 169 |
+
)
|
| 170 |
+
raw = (response.choices[0].message.content or "").strip()
|
| 171 |
+
cleaned = raw.replace("```json", "").replace("```", "").strip()
|
| 172 |
+
payload = json.loads(cleaned)
|
| 173 |
+
return FlakySleuthAction.model_validate(payload)
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def run_episode(
|
| 177 |
+
env: FlakySleuthEnv,
|
| 178 |
+
*,
|
| 179 |
+
task_name: str,
|
| 180 |
+
benchmark_name: str,
|
| 181 |
+
max_steps: int,
|
| 182 |
+
) -> float:
|
| 183 |
+
rewards: list[float] = []
|
| 184 |
+
steps_taken = 0
|
| 185 |
+
score = 0.0
|
| 186 |
+
success = False
|
| 187 |
+
|
| 188 |
+
log_start(task=task_name, env_name=benchmark_name, model=MODEL_NAME)
|
| 189 |
+
|
| 190 |
+
try:
|
| 191 |
+
obs = env.reset()
|
| 192 |
+
messages: list[dict[str, str]] = [
|
| 193 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 194 |
+
{"role": "user", "content": obs_to_prompt(obs, max_steps)},
|
| 195 |
+
]
|
| 196 |
+
|
| 197 |
+
for step_idx in range(1, max_steps + 1):
|
| 198 |
+
try:
|
| 199 |
+
action = llm_action(messages) or heuristic_action(obs)
|
| 200 |
+
except Exception:
|
| 201 |
+
action = heuristic_action(obs)
|
| 202 |
+
|
| 203 |
+
obs, reward, done, info = env.step(action)
|
| 204 |
+
rewards.append(float(reward or 0.0))
|
| 205 |
+
steps_taken = step_idx
|
| 206 |
+
|
| 207 |
+
step_error: str | None = None
|
| 208 |
+
if isinstance(info, dict):
|
| 209 |
+
last_action_error = info.get("last_action_error")
|
| 210 |
+
if last_action_error:
|
| 211 |
+
step_error = str(last_action_error)
|
| 212 |
+
|
| 213 |
+
log_step(
|
| 214 |
+
step=step_idx,
|
| 215 |
+
action=action.model_dump_json(),
|
| 216 |
+
reward=float(reward or 0.0),
|
| 217 |
+
done=bool(done),
|
| 218 |
+
error=step_error,
|
| 219 |
+
)
|
| 220 |
+
|
| 221 |
+
if done:
|
| 222 |
+
score = float(reward or 0.0)
|
| 223 |
+
break
|
| 224 |
+
|
| 225 |
+
messages.append({"role": "assistant", "content": action.model_dump_json()})
|
| 226 |
+
messages.append({"role": "user", "content": obs_to_prompt(obs, max_steps)})
|
| 227 |
+
|
| 228 |
+
score = min(max(score, 0.0), 1.0)
|
| 229 |
+
success = score > 0.0
|
| 230 |
+
except Exception:
|
| 231 |
+
score = 0.0
|
| 232 |
+
success = False
|
| 233 |
+
finally:
|
| 234 |
+
try:
|
| 235 |
+
env.close()
|
| 236 |
+
except Exception:
|
| 237 |
+
pass
|
| 238 |
+
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 239 |
+
|
| 240 |
+
return score
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def _parse_args() -> argparse.Namespace:
|
| 244 |
+
parser = argparse.ArgumentParser(description="Run FlakySleuth compliance inference.")
|
| 245 |
+
parser.add_argument(
|
| 246 |
+
"--dataset-path",
|
| 247 |
+
default="dataset/py_tasks.csv",
|
| 248 |
+
help="Processed task CSV used by the environment.",
|
| 249 |
+
)
|
| 250 |
+
parser.add_argument(
|
| 251 |
+
"--episodes-per-task",
|
| 252 |
+
type=int,
|
| 253 |
+
default=EPISODES_PER_TASK,
|
| 254 |
+
help="Episodes per task type.",
|
| 255 |
+
)
|
| 256 |
+
parser.add_argument(
|
| 257 |
+
"--task-types",
|
| 258 |
+
default="classify,root_cause,fix_proposal",
|
| 259 |
+
help="Comma-separated task types to run (classify,root_cause,fix_proposal).",
|
| 260 |
+
)
|
| 261 |
+
parser.add_argument(
|
| 262 |
+
"--max-steps",
|
| 263 |
+
type=int,
|
| 264 |
+
default=MAX_STEPS,
|
| 265 |
+
help="Max steps per episode.",
|
| 266 |
+
)
|
| 267 |
+
parser.add_argument(
|
| 268 |
+
"--benchmark-name",
|
| 269 |
+
default=BENCHMARK_NAME,
|
| 270 |
+
help="Benchmark label for [START] lines.",
|
| 271 |
+
)
|
| 272 |
+
return parser.parse_args()
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
def main() -> None:
|
| 276 |
+
args = _parse_args()
|
| 277 |
+
env = FlakySleuthEnv(dataset_path=args.dataset_path, max_steps=args.max_steps)
|
| 278 |
+
|
| 279 |
+
allowed_task_types = {"classify", "root_cause", "fix_proposal"}
|
| 280 |
+
task_types = [t.strip() for t in args.task_types.split(",") if t.strip()]
|
| 281 |
+
if not task_types:
|
| 282 |
+
return
|
| 283 |
+
|
| 284 |
+
for task_type in task_types:
|
| 285 |
+
if task_type not in allowed_task_types:
|
| 286 |
+
continue
|
| 287 |
+
env.loader.force_task_type(task_type)
|
| 288 |
+
for _ in range(args.episodes_per_task):
|
| 289 |
+
run_episode(
|
| 290 |
+
env,
|
| 291 |
+
task_name=task_type,
|
| 292 |
+
benchmark_name=args.benchmark_name,
|
| 293 |
+
max_steps=args.max_steps,
|
| 294 |
+
)
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
if __name__ == "__main__":
|
| 298 |
+
main()
|
inference_compliance.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Inference Script Example
|
| 3 |
+
===================================
|
| 4 |
+
MANDATORY
|
| 5 |
+
- Before submitting, ensure the following variables are defined in your environment configuration:
|
| 6 |
+
API_BASE_URL The API endpoint for the LLM.
|
| 7 |
+
MODEL_NAME The model identifier to use for inference.
|
| 8 |
+
HF_TOKEN Your Hugging Face / API key.
|
| 9 |
+
LOCAL_IMAGE_NAME The name of the local image to use for the environment if you are using from_docker_image()
|
| 10 |
+
method
|
| 11 |
+
|
| 12 |
+
- Defaults are set only for API_BASE_URL and MODEL_NAME
|
| 13 |
+
(and should reflect your active inference setup):
|
| 14 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
|
| 15 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
|
| 16 |
+
|
| 17 |
+
- The inference script must be named `inference.py` and placed in the root directory of the project
|
| 18 |
+
- Participants must use OpenAI Client for all LLM calls using above variables
|
| 19 |
+
|
| 20 |
+
STDOUT FORMAT
|
| 21 |
+
- The script must emit exactly three line types to stdout, in this order:
|
| 22 |
+
|
| 23 |
+
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 24 |
+
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 25 |
+
[END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
|
| 26 |
+
|
| 27 |
+
Rules:
|
| 28 |
+
- One [START] line at episode begin.
|
| 29 |
+
- One [STEP] line per step, immediately after env.step() returns.
|
| 30 |
+
- One [END] line after env.close(), always emitted (even on exception).
|
| 31 |
+
- reward and rewards are formatted to 2 decimal places.
|
| 32 |
+
- done and success are lowercase booleans: true or false.
|
| 33 |
+
- error is the raw last_action_error string, or null if none.
|
| 34 |
+
- All fields on a single line with no newlines within a line.
|
| 35 |
+
- Each tasks should return score in [0, 1]
|
| 36 |
+
|
| 37 |
+
Example:
|
| 38 |
+
[START] task=click-test env=miniwob model=Qwen3-VL-30B
|
| 39 |
+
[STEP] step=1 action=click('123') reward=0.00 done=false error=null
|
| 40 |
+
[STEP] step=2 action=fill('456','text') reward=0.00 done=false error=null
|
| 41 |
+
[STEP] step=3 action=click('789') reward=1.00 done=true error=null
|
| 42 |
+
[END] success=true steps=3 score=1.00 rewards=0.00,0.00,1.00
|
| 43 |
+
"""
|
| 44 |
+
|
| 45 |
+
import asyncio
|
| 46 |
+
import os
|
| 47 |
+
import textwrap
|
| 48 |
+
from typing import List, Optional
|
| 49 |
+
|
| 50 |
+
from openai import OpenAI
|
| 51 |
+
|
| 52 |
+
from my_env_v4 import MyEnvV4Action, MyEnvV4Env
|
| 53 |
+
IMAGE_NAME = os.getenv("IMAGE_NAME") # If you are using docker image
|
| 54 |
+
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 55 |
+
|
| 56 |
+
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
|
| 57 |
+
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
|
| 58 |
+
TASK_NAME = os.getenv("MY_ENV_V4_TASK", "echo")
|
| 59 |
+
BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "my_env_v4")
|
| 60 |
+
MAX_STEPS = 8
|
| 61 |
+
TEMPERATURE = 0.7
|
| 62 |
+
MAX_TOKENS = 150
|
| 63 |
+
SUCCESS_SCORE_THRESHOLD = 0.1 # normalized score in [0, 1]
|
| 64 |
+
|
| 65 |
+
# Max possible reward: each token contributes 0.1, across all steps
|
| 66 |
+
_MAX_REWARD_PER_STEP = MAX_TOKENS * 0.1
|
| 67 |
+
MAX_TOTAL_REWARD = MAX_STEPS * _MAX_REWARD_PER_STEP
|
| 68 |
+
|
| 69 |
+
SYSTEM_PROMPT = textwrap.dedent(
|
| 70 |
+
"""
|
| 71 |
+
You are interacting with a simple echo environment.
|
| 72 |
+
Each turn you must send a message. The environment will echo it back.
|
| 73 |
+
Reward is proportional to message length: reward = len(message) * 0.1
|
| 74 |
+
Your goal is to maximize total reward by sending meaningful, substantive messages.
|
| 75 |
+
Reply with exactly one message string — no quotes, no prefixes, just the message text.
|
| 76 |
+
"""
|
| 77 |
+
).strip()
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def log_start(task: str, env: str, model: str) -> None:
|
| 81 |
+
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
|
| 85 |
+
error_val = error if error else "null"
|
| 86 |
+
done_val = str(done).lower()
|
| 87 |
+
print(
|
| 88 |
+
f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
|
| 89 |
+
flush=True,
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
|
| 94 |
+
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 95 |
+
print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def build_user_prompt(step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
|
| 99 |
+
history_block = "\n".join(history[-4:]) if history else "None"
|
| 100 |
+
return textwrap.dedent(
|
| 101 |
+
f"""
|
| 102 |
+
Step: {step}
|
| 103 |
+
Last echoed message: {last_echoed!r}
|
| 104 |
+
Last reward: {last_reward:.2f}
|
| 105 |
+
Previous steps:
|
| 106 |
+
{history_block}
|
| 107 |
+
Send your next message.
|
| 108 |
+
"""
|
| 109 |
+
).strip()
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def get_model_message(client: OpenAI, step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
|
| 113 |
+
user_prompt = build_user_prompt(step, last_echoed, last_reward, history)
|
| 114 |
+
try:
|
| 115 |
+
completion = client.chat.completions.create(
|
| 116 |
+
model=MODEL_NAME,
|
| 117 |
+
messages=[
|
| 118 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 119 |
+
{"role": "user", "content": user_prompt},
|
| 120 |
+
],
|
| 121 |
+
temperature=TEMPERATURE,
|
| 122 |
+
max_tokens=MAX_TOKENS,
|
| 123 |
+
stream=False,
|
| 124 |
+
)
|
| 125 |
+
text = (completion.choices[0].message.content or "").strip()
|
| 126 |
+
return text if text else "hello"
|
| 127 |
+
except Exception as exc:
|
| 128 |
+
print(f"[DEBUG] Model request failed: {exc}", flush=True)
|
| 129 |
+
return "hello"
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
async def main() -> None:
|
| 133 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
|
| 134 |
+
|
| 135 |
+
env = await MyEnvV4Env.from_docker_image(IMAGE_NAME)
|
| 136 |
+
|
| 137 |
+
history: List[str] = []
|
| 138 |
+
rewards: List[float] = []
|
| 139 |
+
steps_taken = 0
|
| 140 |
+
score = 0.0
|
| 141 |
+
success = False
|
| 142 |
+
|
| 143 |
+
log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
|
| 144 |
+
|
| 145 |
+
try:
|
| 146 |
+
result = await env.reset() # OpenENV.reset()
|
| 147 |
+
last_echoed = result.observation.echoed_message
|
| 148 |
+
last_reward = 0.0
|
| 149 |
+
|
| 150 |
+
for step in range(1, MAX_STEPS + 1):
|
| 151 |
+
if result.done:
|
| 152 |
+
break
|
| 153 |
+
|
| 154 |
+
message = get_model_message(client, step, last_echoed, last_reward, history)
|
| 155 |
+
|
| 156 |
+
result = await env.step(MyEnvV4Action(message=message))
|
| 157 |
+
obs = result.observation
|
| 158 |
+
|
| 159 |
+
reward = result.reward or 0.0
|
| 160 |
+
done = result.done
|
| 161 |
+
error = None
|
| 162 |
+
|
| 163 |
+
rewards.append(reward)
|
| 164 |
+
steps_taken = step
|
| 165 |
+
last_echoed = obs.echoed_message
|
| 166 |
+
last_reward = reward
|
| 167 |
+
|
| 168 |
+
log_step(step=step, action=message, reward=reward, done=done, error=error)
|
| 169 |
+
|
| 170 |
+
history.append(f"Step {step}: {message!r} -> reward {reward:+.2f}")
|
| 171 |
+
|
| 172 |
+
if done:
|
| 173 |
+
break
|
| 174 |
+
|
| 175 |
+
score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
|
| 176 |
+
score = min(max(score, 0.0), 1.0) # clamp to [0, 1]
|
| 177 |
+
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 178 |
+
|
| 179 |
+
finally:
|
| 180 |
+
try:
|
| 181 |
+
await env.close()
|
| 182 |
+
except Exception as e:
|
| 183 |
+
print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
|
| 184 |
+
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
if __name__ == "__main__":
|
| 188 |
+
asyncio.run(main())
|
inference_debug.py
ADDED
|
@@ -0,0 +1,606 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FlakySleuth baseline inference script.
|
| 2 |
+
|
| 3 |
+
Environment variables:
|
| 4 |
+
Preferred:
|
| 5 |
+
HF_TOKEN / HUGGINGFACE_HUB_TOKEN (or OPENROUTER_API_KEY / API_KEY)
|
| 6 |
+
API_BASE_URL (optional, defaults to https://openrouter.ai/api/v1 for router-style keys)
|
| 7 |
+
MODEL_NAME (optional, defaults to qwen/qwen3.6-plus:free on OpenRouter)
|
| 8 |
+
|
| 9 |
+
Optional fallback:
|
| 10 |
+
OPENAI_API_KEY
|
| 11 |
+
API_BASE_URL (defaults to https://api.openai.com/v1 when OpenAI key is used)
|
| 12 |
+
MODEL_NAME (defaults to gpt-4o-mini for OpenAI)
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
import argparse
|
| 20 |
+
import time
|
| 21 |
+
from collections import defaultdict
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
from typing import Any
|
| 24 |
+
|
| 25 |
+
from openai import OpenAI
|
| 26 |
+
|
| 27 |
+
try:
|
| 28 |
+
from tqdm import tqdm
|
| 29 |
+
except Exception: # pragma: no cover
|
| 30 |
+
tqdm = None
|
| 31 |
+
|
| 32 |
+
from env.environment import FlakySleuthEnv
|
| 33 |
+
from env.models import FlakySleuthAction, FlakySleuthObservation
|
| 34 |
+
|
| 35 |
+
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
|
| 36 |
+
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
|
| 37 |
+
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
|
| 38 |
+
RAW_API_KEY = os.environ.get("API_KEY")
|
| 39 |
+
API_KEY = RAW_API_KEY or OPENROUTER_API_KEY or OPENAI_API_KEY or HF_TOKEN or ""
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _looks_like_openrouter_key(key: str | None) -> bool:
|
| 43 |
+
return bool(key and key.startswith("sk-or-"))
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
DEFAULT_BASE_URL = (
|
| 47 |
+
"https://router.huggingface.co/v1"
|
| 48 |
+
if (
|
| 49 |
+
HF_TOKEN
|
| 50 |
+
and not RAW_API_KEY
|
| 51 |
+
and not OPENROUTER_API_KEY
|
| 52 |
+
and not OPENAI_API_KEY
|
| 53 |
+
)
|
| 54 |
+
else (
|
| 55 |
+
"https://openrouter.ai/api/v1"
|
| 56 |
+
if (
|
| 57 |
+
(OPENROUTER_API_KEY and not RAW_API_KEY and not OPENAI_API_KEY)
|
| 58 |
+
or (_looks_like_openrouter_key(RAW_API_KEY) and not OPENAI_API_KEY)
|
| 59 |
+
)
|
| 60 |
+
else "https://api.openai.com/v1"
|
| 61 |
+
)
|
| 62 |
+
)
|
| 63 |
+
API_BASE_URL = os.environ.get("API_BASE_URL", DEFAULT_BASE_URL)
|
| 64 |
+
|
| 65 |
+
DEFAULT_MODEL = (
|
| 66 |
+
"openai/gpt-oss-120b:novita"
|
| 67 |
+
if API_BASE_URL.startswith("https://router.huggingface.co")
|
| 68 |
+
else (
|
| 69 |
+
"qwen/qwen3.6-plus:free"
|
| 70 |
+
if API_BASE_URL.startswith("https://openrouter.ai")
|
| 71 |
+
else "gpt-4o-mini"
|
| 72 |
+
)
|
| 73 |
+
)
|
| 74 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", DEFAULT_MODEL)
|
| 75 |
+
# Keep a conservative default to stay under common hackathon runtime limits.
|
| 76 |
+
EPISODES_PER_TASK = 2
|
| 77 |
+
MAX_STEPS = 20
|
| 78 |
+
|
| 79 |
+
client = OpenAI(api_key=API_KEY, base_url=API_BASE_URL)
|
| 80 |
+
|
| 81 |
+
SYSTEM_PROMPT = """You are a flaky test detective.
|
| 82 |
+
|
| 83 |
+
Respond ONLY with a single valid JSON object.
|
| 84 |
+
|
| 85 |
+
Exploration actions:
|
| 86 |
+
{"action_type": "read_file", "argument": "relative/path.py"}
|
| 87 |
+
{"action_type": "search_code", "argument": "pattern"}
|
| 88 |
+
{"action_type": "run_test", "argument": ""}
|
| 89 |
+
|
| 90 |
+
Terminal actions:
|
| 91 |
+
{"action_type": "classify_flakiness", "argument": "flaky"}
|
| 92 |
+
{"action_type": "classify_flakiness", "argument": "stable"}
|
| 93 |
+
{"action_type": "classify_root_cause", "argument": "OD"}
|
| 94 |
+
{"action_type": "classify_root_cause", "argument": "OD-Brit"}
|
| 95 |
+
{"action_type": "classify_root_cause", "argument": "OD-Vic"}
|
| 96 |
+
{"action_type": "classify_root_cause", "argument": "NIO"}
|
| 97 |
+
{"action_type": "classify_root_cause", "argument": "NOD"}
|
| 98 |
+
{"action_type": "classify_root_cause", "argument": "TD"}
|
| 99 |
+
{"action_type": "classify_root_cause", "argument": "TZD"}
|
| 100 |
+
{"action_type": "classify_root_cause", "argument": "ID"}
|
| 101 |
+
{"action_type": "propose_fix", "argument": "--- a/file.py\\n+++ b/file.py\\n@@ ... @@\\n-old\\n+new"}
|
| 102 |
+
|
| 103 |
+
Rules:
|
| 104 |
+
1. Read the test file first.
|
| 105 |
+
2. Search for flaky signals: random, time, sleep, shared state, env vars.
|
| 106 |
+
3. Run the test for non-order-dependent scenarios.
|
| 107 |
+
4. Call one terminal action when confident.
|
| 108 |
+
"""
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def _to_single_line(text: str) -> str:
|
| 112 |
+
return " ".join(str(text).split())
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def _compliance_log_start(task: str, benchmark: str, model: str) -> None:
|
| 116 |
+
print(f"[START] task={task} env={benchmark} model={model}", flush=True)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _compliance_log_step(
|
| 120 |
+
step: int,
|
| 121 |
+
action: str,
|
| 122 |
+
reward: float,
|
| 123 |
+
done: bool,
|
| 124 |
+
error: str | None,
|
| 125 |
+
) -> None:
|
| 126 |
+
error_value = _to_single_line(error) if error else "null"
|
| 127 |
+
print(
|
| 128 |
+
f"[STEP] step={step} action={_to_single_line(action)} "
|
| 129 |
+
f"reward={reward:.2f} done={str(bool(done)).lower()} error={error_value}",
|
| 130 |
+
flush=True,
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _compliance_log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
|
| 135 |
+
rewards_value = ",".join(f"{r:.2f}" for r in rewards)
|
| 136 |
+
print(
|
| 137 |
+
f"[END] success={str(bool(success)).lower()} steps={steps} "
|
| 138 |
+
f"score={score:.2f} rewards={rewards_value}",
|
| 139 |
+
flush=True,
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def obs_to_prompt(obs: FlakySleuthObservation) -> str:
|
| 144 |
+
tree_preview = "\n".join(obs.file_tree[:40])
|
| 145 |
+
return f"""TASK: {obs.task_description}
|
| 146 |
+
|
| 147 |
+
Repository: {obs.repo_url}
|
| 148 |
+
Test name: {obs.test_name}
|
| 149 |
+
Step: {obs.step_count}/{MAX_STEPS}
|
| 150 |
+
|
| 151 |
+
Test source code:
|
| 152 |
+
```python
|
| 153 |
+
{obs.test_code}
|
| 154 |
+
```
|
| 155 |
+
|
| 156 |
+
Repository file tree:
|
| 157 |
+
{tree_preview}
|
| 158 |
+
|
| 159 |
+
Last tool output:
|
| 160 |
+
{obs.tool_output or "(No action taken yet)"}
|
| 161 |
+
|
| 162 |
+
Return only JSON action."""
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def heuristic_action(obs: FlakySleuthObservation) -> FlakySleuthAction:
|
| 166 |
+
if obs.step_count == 0 and obs.file_tree:
|
| 167 |
+
return FlakySleuthAction(action_type="read_file", argument=obs.file_tree[0])
|
| 168 |
+
|
| 169 |
+
if obs.step_count < 2:
|
| 170 |
+
return FlakySleuthAction(action_type="search_code", argument="random")
|
| 171 |
+
|
| 172 |
+
if obs.task_type == "classify":
|
| 173 |
+
return FlakySleuthAction(action_type="classify_flakiness", argument="flaky")
|
| 174 |
+
if obs.task_type == "root_cause":
|
| 175 |
+
return FlakySleuthAction(action_type="classify_root_cause", argument="NOD")
|
| 176 |
+
return FlakySleuthAction(
|
| 177 |
+
action_type="propose_fix",
|
| 178 |
+
argument=(
|
| 179 |
+
"--- a/src/math_utils.py\n"
|
| 180 |
+
"+++ b/src/math_utils.py\n"
|
| 181 |
+
"@@\n"
|
| 182 |
+
"-def unstable_sum(values):\n"
|
| 183 |
+
"- random.shuffle(values)\n"
|
| 184 |
+
"- return values[0] + values[1]\n"
|
| 185 |
+
"+def unstable_sum(values):\n"
|
| 186 |
+
"+ ordered = sorted(values)\n"
|
| 187 |
+
"+ return ordered[0] + ordered[1]\n"
|
| 188 |
+
),
|
| 189 |
+
)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def llm_action(
|
| 193 |
+
messages: list[dict[str, str]],
|
| 194 |
+
) -> tuple[FlakySleuthAction | None, dict[str, Any]]:
|
| 195 |
+
meta: dict[str, Any] = {
|
| 196 |
+
"attempted": False,
|
| 197 |
+
"raw_output": "",
|
| 198 |
+
"error": "",
|
| 199 |
+
}
|
| 200 |
+
if not API_KEY:
|
| 201 |
+
return None, meta
|
| 202 |
+
|
| 203 |
+
meta["attempted"] = True
|
| 204 |
+
response = client.chat.completions.create(
|
| 205 |
+
model=MODEL_NAME,
|
| 206 |
+
messages=messages,
|
| 207 |
+
max_tokens=400,
|
| 208 |
+
temperature=0.0,
|
| 209 |
+
)
|
| 210 |
+
raw = (response.choices[0].message.content or "").strip()
|
| 211 |
+
meta["raw_output"] = raw
|
| 212 |
+
cleaned = raw.replace("```json", "").replace("```", "").strip()
|
| 213 |
+
payload = json.loads(cleaned)
|
| 214 |
+
return FlakySleuthAction.model_validate(payload), meta
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def _clip_text(text: str, max_chars: int) -> str:
|
| 218 |
+
if max_chars <= 0:
|
| 219 |
+
return text
|
| 220 |
+
if len(text) <= max_chars:
|
| 221 |
+
return text
|
| 222 |
+
remaining = len(text) - max_chars
|
| 223 |
+
return f"{text[:max_chars]}\n...[truncated {remaining} chars]"
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def _trace_print(
|
| 227 |
+
enabled: bool,
|
| 228 |
+
message: str,
|
| 229 |
+
*,
|
| 230 |
+
text: str | None = None,
|
| 231 |
+
max_chars: int = 0,
|
| 232 |
+
) -> None:
|
| 233 |
+
if not enabled:
|
| 234 |
+
return
|
| 235 |
+
print(message)
|
| 236 |
+
if text is not None:
|
| 237 |
+
print(_clip_text(text, max_chars))
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def _format_duration(seconds: float) -> str:
|
| 241 |
+
seconds = max(0.0, float(seconds))
|
| 242 |
+
mins, secs = divmod(int(round(seconds)), 60)
|
| 243 |
+
hrs, mins = divmod(mins, 60)
|
| 244 |
+
if hrs > 0:
|
| 245 |
+
return f"{hrs:d}h {mins:02d}m {secs:02d}s"
|
| 246 |
+
return f"{mins:02d}m {secs:02d}s"
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def run_episode(
|
| 250 |
+
env: FlakySleuthEnv,
|
| 251 |
+
*,
|
| 252 |
+
print_terminal: bool = True,
|
| 253 |
+
trace_agent: bool = False,
|
| 254 |
+
trace_prompts: bool = False,
|
| 255 |
+
trace_max_chars: int = 2000,
|
| 256 |
+
episode_label: str = "",
|
| 257 |
+
compliance_stdout: bool = False,
|
| 258 |
+
benchmark_name: str = "flakysleuth",
|
| 259 |
+
compliance_task_name: str | None = None,
|
| 260 |
+
) -> tuple[float, dict[str, Any]]:
|
| 261 |
+
rewards: list[float] = []
|
| 262 |
+
steps_taken = 0
|
| 263 |
+
success = False
|
| 264 |
+
episode_task_name = (compliance_task_name or episode_label.split(" ", 1)[0].strip() or "unknown")
|
| 265 |
+
exploration_reward_total = 0.0
|
| 266 |
+
final_episode_score = 0.0
|
| 267 |
+
terminal_meta: dict[str, Any] = {}
|
| 268 |
+
if compliance_stdout:
|
| 269 |
+
_compliance_log_start(episode_task_name, benchmark_name, MODEL_NAME)
|
| 270 |
+
try:
|
| 271 |
+
obs = env.reset()
|
| 272 |
+
|
| 273 |
+
initial_prompt = obs_to_prompt(obs)
|
| 274 |
+
messages = [
|
| 275 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 276 |
+
{"role": "user", "content": initial_prompt},
|
| 277 |
+
]
|
| 278 |
+
|
| 279 |
+
if not compliance_stdout:
|
| 280 |
+
_trace_print(
|
| 281 |
+
trace_agent,
|
| 282 |
+
(
|
| 283 |
+
f"\n[trace] {episode_label} "
|
| 284 |
+
f"task={obs.task_type} repo={obs.repo_url} test={obs.test_name}"
|
| 285 |
+
).strip(),
|
| 286 |
+
)
|
| 287 |
+
if trace_prompts and not compliance_stdout:
|
| 288 |
+
_trace_print(
|
| 289 |
+
trace_agent,
|
| 290 |
+
"[trace] system prompt:",
|
| 291 |
+
text=SYSTEM_PROMPT,
|
| 292 |
+
max_chars=trace_max_chars,
|
| 293 |
+
)
|
| 294 |
+
_trace_print(
|
| 295 |
+
trace_agent,
|
| 296 |
+
"[trace] initial user prompt:",
|
| 297 |
+
text=initial_prompt,
|
| 298 |
+
max_chars=trace_max_chars,
|
| 299 |
+
)
|
| 300 |
+
|
| 301 |
+
for step_idx in range(MAX_STEPS):
|
| 302 |
+
action: FlakySleuthAction
|
| 303 |
+
action_source = "heuristic"
|
| 304 |
+
llm_meta: dict[str, Any] = {"attempted": False, "raw_output": "", "error": ""}
|
| 305 |
+
try:
|
| 306 |
+
candidate, llm_meta = llm_action(messages)
|
| 307 |
+
if candidate is not None:
|
| 308 |
+
action = candidate
|
| 309 |
+
action_source = "llm"
|
| 310 |
+
else:
|
| 311 |
+
action = heuristic_action(obs)
|
| 312 |
+
if llm_meta.get("attempted"):
|
| 313 |
+
llm_meta["error"] = (
|
| 314 |
+
"Model response unavailable, using heuristic fallback."
|
| 315 |
+
)
|
| 316 |
+
except Exception as exc:
|
| 317 |
+
llm_meta["error"] = str(exc)
|
| 318 |
+
action = heuristic_action(obs)
|
| 319 |
+
|
| 320 |
+
if trace_agent and not compliance_stdout:
|
| 321 |
+
print(f"[trace] step={step_idx + 1} action_source={action_source}")
|
| 322 |
+
if llm_meta.get("attempted"):
|
| 323 |
+
_trace_print(
|
| 324 |
+
True,
|
| 325 |
+
"[trace] raw model output:",
|
| 326 |
+
text=str(llm_meta.get("raw_output", "")),
|
| 327 |
+
max_chars=trace_max_chars,
|
| 328 |
+
)
|
| 329 |
+
if llm_meta.get("error"):
|
| 330 |
+
print(f"[trace] llm_error={llm_meta['error']}")
|
| 331 |
+
print(f"[trace] action={action.model_dump_json()}")
|
| 332 |
+
|
| 333 |
+
obs, reward, done, info = env.step(action)
|
| 334 |
+
rewards.append(reward)
|
| 335 |
+
steps_taken = step_idx + 1
|
| 336 |
+
|
| 337 |
+
step_error: str | None = None
|
| 338 |
+
if isinstance(info, dict):
|
| 339 |
+
raw_err = info.get("last_action_error")
|
| 340 |
+
if raw_err:
|
| 341 |
+
step_error = str(raw_err)
|
| 342 |
+
if not step_error and obs.tool_output and str(obs.tool_output).startswith("ERROR:"):
|
| 343 |
+
step_error = str(obs.tool_output)
|
| 344 |
+
|
| 345 |
+
if compliance_stdout:
|
| 346 |
+
_compliance_log_step(
|
| 347 |
+
step=steps_taken,
|
| 348 |
+
action=action.model_dump_json(),
|
| 349 |
+
reward=reward,
|
| 350 |
+
done=done,
|
| 351 |
+
error=step_error,
|
| 352 |
+
)
|
| 353 |
+
|
| 354 |
+
if trace_agent and not compliance_stdout:
|
| 355 |
+
print(
|
| 356 |
+
f"[trace] step_result reward={reward:.3f} done={done} "
|
| 357 |
+
f"step_count={obs.step_count}"
|
| 358 |
+
)
|
| 359 |
+
if obs.tool_output:
|
| 360 |
+
_trace_print(
|
| 361 |
+
True,
|
| 362 |
+
"[trace] tool_output:",
|
| 363 |
+
text=obs.tool_output,
|
| 364 |
+
max_chars=trace_max_chars,
|
| 365 |
+
)
|
| 366 |
+
|
| 367 |
+
if done:
|
| 368 |
+
# Terminal reward already includes cumulative progress + terminal score.
|
| 369 |
+
final_episode_score = reward
|
| 370 |
+
terminal_meta = {
|
| 371 |
+
"action_type": action.action_type,
|
| 372 |
+
"terminal_score": float(info.get("terminal_score", 0) or 0),
|
| 373 |
+
"progress_score": float(info.get("progress_score", 0) or 0),
|
| 374 |
+
"explore_sum": exploration_reward_total,
|
| 375 |
+
"episode_score": final_episode_score,
|
| 376 |
+
}
|
| 377 |
+
success = final_episode_score > 0.0
|
| 378 |
+
if print_terminal:
|
| 379 |
+
print(
|
| 380 |
+
f" Terminal: {action.action_type}({action.argument[:40]}) "
|
| 381 |
+
f"-> terminal={info.get('terminal_score', 0):.2f} "
|
| 382 |
+
f"progress={info.get('progress_score', 0):.2f} "
|
| 383 |
+
f"explore_sum={exploration_reward_total:.3f} "
|
| 384 |
+
f"episode_score={final_episode_score:.3f}"
|
| 385 |
+
)
|
| 386 |
+
break
|
| 387 |
+
|
| 388 |
+
exploration_reward_total += reward
|
| 389 |
+
messages.append({"role": "assistant", "content": action.model_dump_json()})
|
| 390 |
+
next_prompt = obs_to_prompt(obs)
|
| 391 |
+
messages.append({"role": "user", "content": next_prompt})
|
| 392 |
+
if trace_agent and trace_prompts and not compliance_stdout:
|
| 393 |
+
_trace_print(
|
| 394 |
+
True,
|
| 395 |
+
f"[trace] next user prompt (step={step_idx + 1}):",
|
| 396 |
+
text=next_prompt,
|
| 397 |
+
max_chars=trace_max_chars,
|
| 398 |
+
)
|
| 399 |
+
except Exception as exc:
|
| 400 |
+
terminal_meta["error"] = str(exc)
|
| 401 |
+
success = False
|
| 402 |
+
if not compliance_stdout:
|
| 403 |
+
raise
|
| 404 |
+
finally:
|
| 405 |
+
if compliance_stdout:
|
| 406 |
+
try:
|
| 407 |
+
env.close()
|
| 408 |
+
except Exception:
|
| 409 |
+
pass
|
| 410 |
+
_compliance_log_end(
|
| 411 |
+
success=success,
|
| 412 |
+
steps=steps_taken,
|
| 413 |
+
score=min(max(final_episode_score, 0.0), 1.0),
|
| 414 |
+
rewards=rewards,
|
| 415 |
+
)
|
| 416 |
+
|
| 417 |
+
return final_episode_score, terminal_meta
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
def _looks_like_placeholder_dataset(dataset_path: str) -> bool:
|
| 421 |
+
path = Path(dataset_path)
|
| 422 |
+
if not path.exists():
|
| 423 |
+
return False
|
| 424 |
+
try:
|
| 425 |
+
text = path.read_text(encoding="utf-8", errors="replace")
|
| 426 |
+
except Exception:
|
| 427 |
+
return False
|
| 428 |
+
return "fixture://" in text
|
| 429 |
+
|
| 430 |
+
|
| 431 |
+
def _parse_args() -> argparse.Namespace:
|
| 432 |
+
parser = argparse.ArgumentParser(description="Run FlakySleuth baseline inference.")
|
| 433 |
+
parser.add_argument(
|
| 434 |
+
"--dataset-path",
|
| 435 |
+
default="dataset/py_tasks.csv",
|
| 436 |
+
help="Processed task CSV used by the environment.",
|
| 437 |
+
)
|
| 438 |
+
parser.add_argument(
|
| 439 |
+
"--episodes-per-task",
|
| 440 |
+
type=int,
|
| 441 |
+
default=EPISODES_PER_TASK,
|
| 442 |
+
help="Episodes per task type.",
|
| 443 |
+
)
|
| 444 |
+
parser.add_argument(
|
| 445 |
+
"--task-types",
|
| 446 |
+
default="classify,root_cause,fix_proposal",
|
| 447 |
+
help="Comma-separated task types to run (classify,root_cause,fix_proposal).",
|
| 448 |
+
)
|
| 449 |
+
parser.add_argument(
|
| 450 |
+
"--no-progress",
|
| 451 |
+
action="store_true",
|
| 452 |
+
help="Disable progress bars and print classic per-episode logs.",
|
| 453 |
+
)
|
| 454 |
+
parser.add_argument(
|
| 455 |
+
"--trace-agent",
|
| 456 |
+
action="store_true",
|
| 457 |
+
help=(
|
| 458 |
+
"Print detailed agent trace: model output, chosen action/tool call, and "
|
| 459 |
+
"step results for every episode."
|
| 460 |
+
),
|
| 461 |
+
)
|
| 462 |
+
parser.add_argument(
|
| 463 |
+
"--trace-prompts",
|
| 464 |
+
action="store_true",
|
| 465 |
+
help="When tracing, also print full prompts sent to the model.",
|
| 466 |
+
)
|
| 467 |
+
parser.add_argument(
|
| 468 |
+
"--trace-max-chars",
|
| 469 |
+
type=int,
|
| 470 |
+
default=2500,
|
| 471 |
+
help="Max chars per traced text block (prompt/model output/tool output).",
|
| 472 |
+
)
|
| 473 |
+
parser.add_argument(
|
| 474 |
+
"--compliance-stdout",
|
| 475 |
+
action="store_true",
|
| 476 |
+
help=(
|
| 477 |
+
"Emit strict compliance logs to stdout using only [START]/[STEP]/[END] lines "
|
| 478 |
+
"for each episode."
|
| 479 |
+
),
|
| 480 |
+
)
|
| 481 |
+
parser.add_argument(
|
| 482 |
+
"--benchmark-name",
|
| 483 |
+
default="flakysleuth",
|
| 484 |
+
help="Benchmark name used in [START] lines when --compliance-stdout is enabled.",
|
| 485 |
+
)
|
| 486 |
+
return parser.parse_args()
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
def main() -> None:
|
| 490 |
+
run_start = time.perf_counter()
|
| 491 |
+
args = _parse_args()
|
| 492 |
+
env = FlakySleuthEnv(dataset_path=args.dataset_path)
|
| 493 |
+
allowed_task_types = {"classify", "root_cause", "fix_proposal"}
|
| 494 |
+
task_types = [t.strip() for t in args.task_types.split(",") if t.strip()]
|
| 495 |
+
invalid = [t for t in task_types if t not in allowed_task_types]
|
| 496 |
+
if invalid:
|
| 497 |
+
raise ValueError(
|
| 498 |
+
f"Invalid task type(s): {invalid}. "
|
| 499 |
+
"Valid values: classify,root_cause,fix_proposal."
|
| 500 |
+
)
|
| 501 |
+
if not task_types:
|
| 502 |
+
raise ValueError(
|
| 503 |
+
"No task types selected. Pass --task-types with at least one value."
|
| 504 |
+
)
|
| 505 |
+
results: dict[str, list[float]] = defaultdict(list)
|
| 506 |
+
|
| 507 |
+
if _looks_like_placeholder_dataset(args.dataset_path) and not args.compliance_stdout:
|
| 508 |
+
print(
|
| 509 |
+
"[warning] dataset appears to contain fixture rows (fixture://...). "
|
| 510 |
+
"Build real dataset from py-data.csv for real evaluation."
|
| 511 |
+
)
|
| 512 |
+
|
| 513 |
+
use_progress = (tqdm is not None) and (not args.no_progress) and (not args.compliance_stdout)
|
| 514 |
+
if args.trace_agent and use_progress and not args.compliance_stdout:
|
| 515 |
+
print(
|
| 516 |
+
"[info] --trace-agent enabled, disabling progress bars for readable trace logs."
|
| 517 |
+
)
|
| 518 |
+
use_progress = False
|
| 519 |
+
overall_bar = None
|
| 520 |
+
if use_progress:
|
| 521 |
+
overall_bar = tqdm(
|
| 522 |
+
total=len(task_types) * args.episodes_per_task,
|
| 523 |
+
desc="All tasks",
|
| 524 |
+
unit="ep",
|
| 525 |
+
dynamic_ncols=True,
|
| 526 |
+
)
|
| 527 |
+
|
| 528 |
+
for task_type in task_types:
|
| 529 |
+
task_start = time.perf_counter()
|
| 530 |
+
if not args.compliance_stdout:
|
| 531 |
+
print(f"\n-- Task type: {task_type} --")
|
| 532 |
+
env.loader.force_task_type(task_type)
|
| 533 |
+
task_bar = None
|
| 534 |
+
if use_progress:
|
| 535 |
+
task_bar = tqdm(
|
| 536 |
+
total=args.episodes_per_task,
|
| 537 |
+
desc=f"{task_type}",
|
| 538 |
+
unit="ep",
|
| 539 |
+
leave=False,
|
| 540 |
+
dynamic_ncols=True,
|
| 541 |
+
)
|
| 542 |
+
for episode in range(args.episodes_per_task):
|
| 543 |
+
score, meta = run_episode(
|
| 544 |
+
env,
|
| 545 |
+
print_terminal=(not use_progress) and (not args.compliance_stdout),
|
| 546 |
+
trace_agent=args.trace_agent,
|
| 547 |
+
trace_prompts=args.trace_prompts,
|
| 548 |
+
trace_max_chars=args.trace_max_chars,
|
| 549 |
+
episode_label=f"{task_type} ep={episode + 1}/{args.episodes_per_task}",
|
| 550 |
+
compliance_stdout=args.compliance_stdout,
|
| 551 |
+
benchmark_name=args.benchmark_name,
|
| 552 |
+
compliance_task_name=task_type,
|
| 553 |
+
)
|
| 554 |
+
results[task_type].append(score)
|
| 555 |
+
if use_progress and task_bar is not None:
|
| 556 |
+
task_bar.update(1)
|
| 557 |
+
task_avg = sum(results[task_type]) / len(results[task_type])
|
| 558 |
+
task_bar.set_postfix(
|
| 559 |
+
score=f"{score:.3f}",
|
| 560 |
+
avg=f"{task_avg:.3f}",
|
| 561 |
+
term=f"{meta.get('terminal_score', 0):.2f}",
|
| 562 |
+
)
|
| 563 |
+
if overall_bar is not None:
|
| 564 |
+
overall_bar.update(1)
|
| 565 |
+
all_scores = [s for values in results.values() for s in values]
|
| 566 |
+
overall_avg = sum(all_scores) / len(all_scores)
|
| 567 |
+
overall_bar.set_postfix(task=task_type, avg=f"{overall_avg:.3f}")
|
| 568 |
+
elif not args.compliance_stdout:
|
| 569 |
+
print(f" Episode {episode + 1}: {score:.3f}")
|
| 570 |
+
if task_bar is not None:
|
| 571 |
+
task_bar.close()
|
| 572 |
+
task_elapsed = time.perf_counter() - task_start
|
| 573 |
+
if not args.compliance_stdout:
|
| 574 |
+
avg_task = sum(results[task_type]) / max(1, len(results[task_type]))
|
| 575 |
+
print(
|
| 576 |
+
f" [time] task={task_type} elapsed={_format_duration(task_elapsed)} "
|
| 577 |
+
f"avg_ep={task_elapsed / max(1, args.episodes_per_task):.2f}s "
|
| 578 |
+
f"avg_score={avg_task:.3f}"
|
| 579 |
+
)
|
| 580 |
+
|
| 581 |
+
if overall_bar is not None:
|
| 582 |
+
overall_bar.close()
|
| 583 |
+
|
| 584 |
+
if args.compliance_stdout:
|
| 585 |
+
return
|
| 586 |
+
|
| 587 |
+
total_elapsed = time.perf_counter() - run_start
|
| 588 |
+
print("\n== BASELINE RESULTS ==")
|
| 589 |
+
all_scores: list[float] = []
|
| 590 |
+
for task_type in task_types:
|
| 591 |
+
scores = results[task_type]
|
| 592 |
+
avg = sum(scores) / len(scores)
|
| 593 |
+
all_scores.extend(scores)
|
| 594 |
+
print(f" {task_type:12s} avg={avg:.3f} scores={[round(s, 3) for s in scores]}")
|
| 595 |
+
|
| 596 |
+
overall = sum(all_scores) / len(all_scores)
|
| 597 |
+
print(f" {'OVERALL':12s} avg={overall:.3f}")
|
| 598 |
+
print(
|
| 599 |
+
f" {'RUNTIME':12s} total={_format_duration(total_elapsed)} "
|
| 600 |
+
f"episodes={len(all_scores)} "
|
| 601 |
+
f"avg_ep={(total_elapsed / max(1, len(all_scores))):.2f}s"
|
| 602 |
+
)
|
| 603 |
+
|
| 604 |
+
|
| 605 |
+
if __name__ == "__main__":
|
| 606 |
+
main()
|
models.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from env.models import FlakySleuthAction, FlakySleuthObservation, FlakySleuthReward
|
| 2 |
+
|
| 3 |
+
__all__ = ["FlakySleuthAction", "FlakySleuthObservation", "FlakySleuthReward"]
|
openenv.yaml
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: flaky_sleuth
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
| 7 |
+
|
| 8 |
+
version: 0.1.0
|
| 9 |
+
description: >
|
| 10 |
+
An RL environment where an LLM agent investigates flaky tests in Python repositories.
|
| 11 |
+
The agent uses tool-like actions to read files, search code, and run tests, then submits
|
| 12 |
+
a terminal verdict for classification, root-cause detection, or fix proposal.
|
| 13 |
+
|
| 14 |
+
action_type: FlakySleuthAction
|
| 15 |
+
observation_type: FlakySleuthObservation
|
| 16 |
+
reward_range: [0.0, 1.0]
|
| 17 |
+
episode_max_steps: 20
|
| 18 |
+
baseline_script: inference.py
|
| 19 |
+
|
| 20 |
+
tasks:
|
| 21 |
+
- id: task1_classify
|
| 22 |
+
name: Flaky vs Stable Classification
|
| 23 |
+
difficulty: easy
|
| 24 |
+
description: Classify the target test as flaky or stable.
|
| 25 |
+
- id: task2_root_cause
|
| 26 |
+
name: Root Cause Category Identification
|
| 27 |
+
difficulty: medium
|
| 28 |
+
description: Predict flaky-test root-cause category (OD, NOD, TD, TZD, NIO, ID, etc.).
|
| 29 |
+
- id: task3_fix_proposal
|
| 30 |
+
name: Fix Proposal
|
| 31 |
+
difficulty: hard
|
| 32 |
+
description: Propose a concrete fix as unified diff for a known flaky test.
|
| 33 |
+
|
| 34 |
+
infra:
|
| 35 |
+
vcpu: 2
|
| 36 |
+
memory_gb: 8
|
| 37 |
+
max_inference_minutes: 20
|
pyproject.toml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=45", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "openenv-flaky-sleuth"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "FlakySleuth OpenEnv environment for flaky test investigation"
|
| 9 |
+
requires-python = ">=3.10"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"openenv-core[core]>=0.2.3",
|
| 12 |
+
"fastapi>=0.110.0",
|
| 13 |
+
"uvicorn>=0.27.0",
|
| 14 |
+
"pydantic>=2.0.0",
|
| 15 |
+
"openai>=1.0.0",
|
| 16 |
+
"pandas>=2.0.0",
|
| 17 |
+
"pytest>=7.0.0",
|
| 18 |
+
"pytest-timeout>=2.0.0",
|
| 19 |
+
"requests>=2.31.0",
|
| 20 |
+
"tqdm>=4.66.0",
|
| 21 |
+
]
|
| 22 |
+
|
| 23 |
+
[project.optional-dependencies]
|
| 24 |
+
dev = [
|
| 25 |
+
"pytest>=8.0.0",
|
| 26 |
+
"pytest-cov>=4.0.0",
|
| 27 |
+
]
|
| 28 |
+
|
| 29 |
+
[project.scripts]
|
| 30 |
+
server = "server.app:main"
|
| 31 |
+
|
| 32 |
+
[tool.setuptools]
|
| 33 |
+
include-package-data = true
|
| 34 |
+
packages = ["env", "graders", "server"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.110.0
|
| 2 |
+
uvicorn>=0.27.0
|
| 3 |
+
pydantic>=2.0.0
|
| 4 |
+
openai>=1.0.0
|
| 5 |
+
pandas>=2.0.0
|
| 6 |
+
pytest>=7.0.0
|
| 7 |
+
pytest-timeout>=2.0.0
|
| 8 |
+
requests>=2.31.0
|
| 9 |
+
tqdm>=4.66.0
|
| 10 |
+
openenv-core[core]>=0.2.3
|
server.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility entrypoint for running the API as `python server.py`."""
|
| 2 |
+
|
| 3 |
+
from server.app import app, main
|
| 4 |
+
|
| 5 |
+
__all__ = ["app", "main"]
|
| 6 |
+
|
| 7 |
+
if __name__ == "__main__":
|
| 8 |
+
main()
|
server/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from server.app import app
|
| 2 |
+
|
| 3 |
+
__all__ = ["app"]
|
server/app.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
from fastapi import Body, FastAPI, HTTPException
|
| 6 |
+
from pydantic import BaseModel, ValidationError
|
| 7 |
+
|
| 8 |
+
from env.environment import FlakySleuthEnv
|
| 9 |
+
from env.models import FlakySleuthAction, FlakySleuthObservation
|
| 10 |
+
|
| 11 |
+
app = FastAPI(title="FlakySleuth Environment")
|
| 12 |
+
env = FlakySleuthEnv()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class FlakySleuthState(BaseModel):
|
| 16 |
+
repo_url: str | None = None
|
| 17 |
+
test_name: str | None = None
|
| 18 |
+
task_type: str | None = None
|
| 19 |
+
step_count: int
|
| 20 |
+
files_read: list[str]
|
| 21 |
+
cumulative_progress: float
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@app.post("/reset")
|
| 25 |
+
def reset() -> dict[str, Any]:
|
| 26 |
+
observation = env.reset()
|
| 27 |
+
return {
|
| 28 |
+
"observation": observation.model_dump(),
|
| 29 |
+
"reward": None,
|
| 30 |
+
"done": False,
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@app.post("/step")
|
| 35 |
+
def step(payload: dict[str, Any] = Body(...)) -> dict[str, Any]:
|
| 36 |
+
"""Accept either {'action': {...}} or direct action payload."""
|
| 37 |
+
try:
|
| 38 |
+
action_payload = payload.get("action", payload)
|
| 39 |
+
action = FlakySleuthAction.model_validate(action_payload)
|
| 40 |
+
except ValidationError as exc:
|
| 41 |
+
raise HTTPException(status_code=422, detail=exc.errors()) from exc
|
| 42 |
+
|
| 43 |
+
try:
|
| 44 |
+
observation, reward, done, info = env.step(action)
|
| 45 |
+
except RuntimeError as exc:
|
| 46 |
+
raise HTTPException(status_code=400, detail=str(exc)) from exc
|
| 47 |
+
|
| 48 |
+
return {
|
| 49 |
+
"observation": observation.model_dump(),
|
| 50 |
+
"reward": reward,
|
| 51 |
+
"done": done,
|
| 52 |
+
"info": info,
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@app.get("/state")
|
| 57 |
+
def state() -> dict[str, Any]:
|
| 58 |
+
return env.state()
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@app.get("/schema")
|
| 62 |
+
def schema() -> dict[str, Any]:
|
| 63 |
+
return {
|
| 64 |
+
"action": FlakySleuthAction.model_json_schema(),
|
| 65 |
+
"observation": FlakySleuthObservation.model_json_schema(),
|
| 66 |
+
"state": FlakySleuthState.model_json_schema(),
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
@app.get("/health")
|
| 71 |
+
def health() -> dict[str, str]:
|
| 72 |
+
return {"status": "healthy"}
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@app.get("/metadata")
|
| 76 |
+
def metadata() -> dict[str, str]:
|
| 77 |
+
return {
|
| 78 |
+
"name": "FlakySleuth Environment",
|
| 79 |
+
"description": (
|
| 80 |
+
"RL environment for flaky-test investigation in Python repositories."
|
| 81 |
+
),
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
@app.post("/mcp")
|
| 86 |
+
def mcp(payload: dict[str, Any] = Body(default_factory=dict)) -> dict[str, Any]:
|
| 87 |
+
request_id = payload.get("id")
|
| 88 |
+
return {
|
| 89 |
+
"jsonrpc": "2.0",
|
| 90 |
+
"id": request_id,
|
| 91 |
+
"result": {"status": "ok"},
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
|
| 96 |
+
import uvicorn
|
| 97 |
+
|
| 98 |
+
uvicorn.run(app, host=host, port=port)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
if __name__ == "__main__":
|
| 102 |
+
main()
|
tests/test_compliance.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from env.environment import FlakySleuthEnv
|
| 2 |
+
from env.models import FlakySleuthAction
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def test_reset_and_step_smoke():
|
| 6 |
+
env = FlakySleuthEnv(dataset_path="dataset/py_tasks.csv")
|
| 7 |
+
obs = env.reset()
|
| 8 |
+
|
| 9 |
+
assert obs.test_name
|
| 10 |
+
assert obs.task_type in {"classify", "root_cause", "fix_proposal"}
|
| 11 |
+
|
| 12 |
+
action = FlakySleuthAction(action_type="search_code", argument="random")
|
| 13 |
+
next_obs, reward, done, info = env.step(action)
|
| 14 |
+
|
| 15 |
+
assert isinstance(next_obs.file_tree, list)
|
| 16 |
+
assert isinstance(reward, float)
|
| 17 |
+
assert isinstance(done, bool)
|
| 18 |
+
assert isinstance(info, dict)
|
uv.lock
ADDED
|
File without changes
|