Spaces:
Sleeping
Sleeping
Commit Β·
486044c
1
Parent(s): 2a9bd42
Resubmission: strict grader, tests, randomization, improved rewards
Browse files- .dockerignore +15 -0
- .gitattributes +0 -35
- README.md +70 -23
- __pycache__/__init__.cpython-313.pyc +0 -0
- __pycache__/client.cpython-313.pyc +0 -0
- __pycache__/scenarios.cpython-313.pyc +0 -0
- inference.py +46 -35
- openenv.yaml +3 -8
- openenv_api_debug_env.egg-info/PKG-INFO +9 -0
- openenv_api_debug_env.egg-info/SOURCES.txt +17 -0
- openenv_api_debug_env.egg-info/dependency_links.txt +1 -0
- openenv_api_debug_env.egg-info/entry_points.txt +2 -0
- openenv_api_debug_env.egg-info/requires.txt +5 -0
- openenv_api_debug_env.egg-info/top_level.txt +1 -0
- scenarios.py +37 -10
- server/__pycache__/api_debug_env_environment.cpython-313.pyc +0 -0
- server/__pycache__/app.cpython-313.pyc +0 -0
- server/api_debug_env_environment.py +123 -17
- server/app.py +29 -6
- tests/__init__.py +1 -0
- tests/__pycache__/__init__.cpython-313.pyc +0 -0
- tests/__pycache__/test_environment.cpython-313-pytest-8.4.1.pyc +0 -0
- tests/test_environment.py +464 -0
.dockerignore
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.egg-info/
|
| 5 |
+
.venv/
|
| 6 |
+
.git/
|
| 7 |
+
.gitignore
|
| 8 |
+
*.md
|
| 9 |
+
!README.md
|
| 10 |
+
uv.lock
|
| 11 |
+
scripts/
|
| 12 |
+
tests/
|
| 13 |
+
.pytest_cache/
|
| 14 |
+
.mypy_cache/
|
| 15 |
+
.ruff_cache/
|
.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -22,6 +22,13 @@ Agents interact with a simulated multi-service API ecosystem that has various mi
|
|
| 22 |
3. **Test endpoints** to observe current behavior
|
| 23 |
4. **Submit fixes** with corrected configuration payloads
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
## Action Space
|
| 26 |
|
| 27 |
```python
|
|
@@ -33,10 +40,11 @@ class ApiDebugAction(Action):
|
|
| 33 |
|
| 34 |
| Action | Description | Reward |
|
| 35 |
|--------|-------------|--------|
|
| 36 |
-
| `inspect_logs` | Read error logs for a service | +0.
|
| 37 |
-
| `inspect_config` | View current config of a service | +0.
|
| 38 |
| `inspect_endpoint` | Test-call an endpoint | +0.02 to +0.05 |
|
| 39 |
| `submit_fix` | Submit a configuration fix | +0.25 (correct) / -0.1 (wrong) |
|
|
|
|
| 40 |
|
| 41 |
## Observation Space
|
| 42 |
|
|
@@ -78,29 +86,52 @@ class ApiDebugObservation(Observation):
|
|
| 78 |
|
| 79 |
## Reward Function
|
| 80 |
|
| 81 |
-
- **
|
| 82 |
-
- **
|
|
|
|
|
|
|
| 83 |
- **Completion bonus**: +0.2 when all issues are resolved
|
| 84 |
- **Penalties**: -0.1 for wrong fixes, -0.05 for invalid actions
|
| 85 |
|
| 86 |
## Grading
|
| 87 |
|
| 88 |
```
|
| 89 |
-
Score = (issues_fixed / issues_total) Γ efficiency_bonus
|
| 90 |
efficiency_bonus = 1.0 + (remaining_steps / max_steps Γ 0.3)
|
|
|
|
| 91 |
```
|
| 92 |
|
| 93 |
-
Faster fixes earn up to 30% bonus.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
-
#
|
|
|
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
| Easy | 0.0000 | 0.34 | 2/2 | 0/2 | 6 |
|
| 100 |
-
| Medium | 0.0000 | 0.53 | 3/3 | 0/3 | 9 |
|
| 101 |
-
| Hard | 0.0000 | 0.87 | 5/5 | 0/5 | 15 |
|
| 102 |
|
| 103 |
-
|
|
|
|
| 104 |
|
| 105 |
## Setup & Usage
|
| 106 |
|
|
@@ -130,21 +161,28 @@ docker build -t api_debug_env:latest -f server/Dockerfile .
|
|
| 130 |
docker run -p 8000:8000 api_debug_env:latest
|
| 131 |
```
|
| 132 |
|
| 133 |
-
### Run
|
| 134 |
|
| 135 |
```bash
|
| 136 |
-
#
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
```
|
| 143 |
|
| 144 |
### API Endpoints
|
| 145 |
|
| 146 |
| Endpoint | Method | Description |
|
| 147 |
|----------|--------|-------------|
|
|
|
|
| 148 |
| `/reset` | POST | Reset environment, start new episode |
|
| 149 |
| `/step` | POST | Execute an action |
|
| 150 |
| `/state` | GET | Get current state |
|
|
@@ -152,7 +190,7 @@ python scripts/baseline_inference.py --mode llm
|
|
| 152 |
| `/grader` | POST | Get grader score for completed episode |
|
| 153 |
| `/baseline` | POST | Run baseline inference on all tasks |
|
| 154 |
| `/schema` | GET | Get action/observation JSON schemas |
|
| 155 |
-
| `/
|
| 156 |
|
| 157 |
## Project Structure
|
| 158 |
|
|
@@ -160,18 +198,27 @@ python scripts/baseline_inference.py --mode llm
|
|
| 160 |
api_debug_env/
|
| 161 |
βββ inference.py # β
MANDATORY hackathon inference script
|
| 162 |
βββ models.py # Pydantic Action & Observation models
|
| 163 |
-
βββ scenarios.py # 3 task scenarios with
|
| 164 |
βββ client.py # WebSocket client for the environment
|
| 165 |
-
βββ openenv.yaml # OpenEnv metadata
|
| 166 |
βββ pyproject.toml # Dependencies & build config
|
| 167 |
βββ server/
|
| 168 |
β βββ app.py # FastAPI application
|
| 169 |
β βββ api_debug_env_environment.py # Core environment logic
|
| 170 |
β βββ Dockerfile # Container build
|
|
|
|
|
|
|
| 171 |
βββ scripts/
|
| 172 |
βββ baseline_inference.py # Original baseline agent script
|
| 173 |
```
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
## License
|
| 176 |
|
| 177 |
-
BSD-style license. See LICENSE file
|
|
|
|
| 22 |
3. **Test endpoints** to observe current behavior
|
| 23 |
4. **Submit fixes** with corrected configuration payloads
|
| 24 |
|
| 25 |
+
The environment features:
|
| 26 |
+
- **3 difficulty levels** with increasing complexity (2, 3, and 5 issues)
|
| 27 |
+
- **Strict value validation** on fixes (grader checks both key AND value)
|
| 28 |
+
- **Seed-based randomization** for reproducible yet varied episodes
|
| 29 |
+
- **Penalty for repeated inspections** to encourage efficient exploration
|
| 30 |
+
- **Comprehensive test suite** with 30+ unit tests
|
| 31 |
+
|
| 32 |
## Action Space
|
| 33 |
|
| 34 |
```python
|
|
|
|
| 40 |
|
| 41 |
| Action | Description | Reward |
|
| 42 |
|--------|-------------|--------|
|
| 43 |
+
| `inspect_logs` | Read error logs for a service | +0.15 (finds new issue) / +0.05 (first time, no issue) / 0.0 (repeat) |
|
| 44 |
+
| `inspect_config` | View current config of a service | +0.05 (has issues) / +0.01 (no issues) / 0.0 (repeat) |
|
| 45 |
| `inspect_endpoint` | Test-call an endpoint | +0.02 to +0.05 |
|
| 46 |
| `submit_fix` | Submit a configuration fix | +0.25 (correct) / -0.1 (wrong) |
|
| 47 |
+
| *step cost* | Applied every step | -0.01 |
|
| 48 |
|
| 49 |
## Observation Space
|
| 50 |
|
|
|
|
| 86 |
|
| 87 |
## Reward Function
|
| 88 |
|
| 89 |
+
- **Step cost**: -0.01 per step to encourage efficiency
|
| 90 |
+
- **Partial progress**: First useful inspection earns reward (+0.05 to +0.15)
|
| 91 |
+
- **Repeated inspection**: 0 reward (prevents reward farming)
|
| 92 |
+
- **Fix rewards**: +0.25 per correctly fixed issue (strict key+value validation)
|
| 93 |
- **Completion bonus**: +0.2 when all issues are resolved
|
| 94 |
- **Penalties**: -0.1 for wrong fixes, -0.05 for invalid actions
|
| 95 |
|
| 96 |
## Grading
|
| 97 |
|
| 98 |
```
|
| 99 |
+
Score = (issues_fixed / issues_total) Γ efficiency_bonus + exploration_bonus
|
| 100 |
efficiency_bonus = 1.0 + (remaining_steps / max_steps Γ 0.3)
|
| 101 |
+
exploration_bonus = issues_found / issues_total Γ 0.1
|
| 102 |
```
|
| 103 |
|
| 104 |
+
Faster fixes earn up to 30% bonus. Scores strictly clamped to (0.001, 0.999).
|
| 105 |
+
|
| 106 |
+
## Baseline Scores (Rule-Based Agent)
|
| 107 |
+
|
| 108 |
+
| Task | Score | Issues Fixed | Issues Total | Steps |
|
| 109 |
+
|------|-------|-------------|-------------|-------|
|
| 110 |
+
| Easy | ~0.85 | 2/2 | 2 | 6 |
|
| 111 |
+
| Medium | ~0.65 | 3/3 | 3 | 9 |
|
| 112 |
+
| Hard | ~0.55 | 5/5 | 5 | 15 |
|
| 113 |
+
|
| 114 |
+
> The rule-based baseline inspects logs/configs then submits known fixes. An LLM agent with proper reasoning can achieve higher scores by solving issues more efficiently.
|
| 115 |
+
|
| 116 |
+
## Example Interaction (Easy Task)
|
| 117 |
+
|
| 118 |
+
```text
|
| 119 |
+
[START] task=easy env=api_debug_env model=Qwen/Qwen2.5-72B-Instruct
|
| 120 |
+
|
| 121 |
+
# Agent inspects logs and finds Auth error
|
| 122 |
+
[STEP] step=1 action=inspect_logs(target=payment_client) reward=0.14 done=false error=null
|
| 123 |
+
|
| 124 |
+
# Agent checks config to understand current headers
|
| 125 |
+
[STEP] step=2 action=inspect_config(target=payment_client) reward=0.04 done=false error=null
|
| 126 |
|
| 127 |
+
# Agent fixes the authorization header
|
| 128 |
+
[STEP] step=3 action=submit_fix(target=payment_client,fix={"headers.Authorization":"Bearer sk_live_token123"}) reward=0.24 done=false error=null
|
| 129 |
|
| 130 |
+
# Agent fixes the content type
|
| 131 |
+
[STEP] step=4 action=submit_fix(target=payment_client,fix={"headers.Content-Type":"application/json"}) reward=0.44 done=true error=null
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
+
[END] success=true steps=4 score=0.899 rewards=0.14,0.04,0.24,0.44
|
| 134 |
+
```
|
| 135 |
|
| 136 |
## Setup & Usage
|
| 137 |
|
|
|
|
| 161 |
docker run -p 8000:8000 api_debug_env:latest
|
| 162 |
```
|
| 163 |
|
| 164 |
+
### Run Inference
|
| 165 |
|
| 166 |
```bash
|
| 167 |
+
# Set API credentials
|
| 168 |
+
export HF_TOKEN=your-key
|
| 169 |
+
|
| 170 |
+
# Run inference on all tasks
|
| 171 |
+
python inference.py
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
### Run Tests
|
| 175 |
|
| 176 |
+
```bash
|
| 177 |
+
cd api_debug_env
|
| 178 |
+
pytest tests/ -v --tb=short
|
| 179 |
```
|
| 180 |
|
| 181 |
### API Endpoints
|
| 182 |
|
| 183 |
| Endpoint | Method | Description |
|
| 184 |
|----------|--------|-------------|
|
| 185 |
+
| `/` | GET | Root β environment info and links |
|
| 186 |
| `/reset` | POST | Reset environment, start new episode |
|
| 187 |
| `/step` | POST | Execute an action |
|
| 188 |
| `/state` | GET | Get current state |
|
|
|
|
| 190 |
| `/grader` | POST | Get grader score for completed episode |
|
| 191 |
| `/baseline` | POST | Run baseline inference on all tasks |
|
| 192 |
| `/schema` | GET | Get action/observation JSON schemas |
|
| 193 |
+
| `/health` | GET | Health check endpoint |
|
| 194 |
|
| 195 |
## Project Structure
|
| 196 |
|
|
|
|
| 198 |
api_debug_env/
|
| 199 |
βββ inference.py # β
MANDATORY hackathon inference script
|
| 200 |
βββ models.py # Pydantic Action & Observation models
|
| 201 |
+
βββ scenarios.py # 3 task scenarios with randomization support
|
| 202 |
βββ client.py # WebSocket client for the environment
|
| 203 |
+
βββ openenv.yaml # OpenEnv metadata (spec v1)
|
| 204 |
βββ pyproject.toml # Dependencies & build config
|
| 205 |
βββ server/
|
| 206 |
β βββ app.py # FastAPI application
|
| 207 |
β βββ api_debug_env_environment.py # Core environment logic
|
| 208 |
β βββ Dockerfile # Container build
|
| 209 |
+
βββ tests/
|
| 210 |
+
β βββ test_environment.py # 30+ unit & integration tests
|
| 211 |
βββ scripts/
|
| 212 |
βββ baseline_inference.py # Original baseline agent script
|
| 213 |
```
|
| 214 |
|
| 215 |
+
## Randomization & Reproducibility
|
| 216 |
+
|
| 217 |
+
The environment supports seed-based randomization via `reset(seed=42)`. This:
|
| 218 |
+
- Shuffles log entry order so agents can't memorize positions
|
| 219 |
+
- Ensures reproducible episodes for consistent evaluation
|
| 220 |
+
- When `seed=None` (default), returns the canonical scenario for testing
|
| 221 |
+
|
| 222 |
## License
|
| 223 |
|
| 224 |
+
BSD-style license. See LICENSE file.
|
__pycache__/__init__.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/__init__.cpython-313.pyc and b/__pycache__/__init__.cpython-313.pyc differ
|
|
|
__pycache__/client.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/client.cpython-313.pyc and b/__pycache__/client.cpython-313.pyc differ
|
|
|
__pycache__/scenarios.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/scenarios.cpython-313.pyc and b/__pycache__/scenarios.cpython-313.pyc differ
|
|
|
inference.py
CHANGED
|
@@ -27,6 +27,7 @@ import asyncio
|
|
| 27 |
import json
|
| 28 |
import os
|
| 29 |
import textwrap
|
|
|
|
| 30 |
from typing import Dict, List, Optional
|
| 31 |
|
| 32 |
from openai import OpenAI
|
|
@@ -127,45 +128,55 @@ def get_model_action(
|
|
| 127 |
obs: ApiDebugObservation,
|
| 128 |
step: int,
|
| 129 |
messages: List[Dict],
|
|
|
|
| 130 |
) -> ApiDebugAction:
|
| 131 |
-
"""Get next action from the LLM."""
|
| 132 |
user_prompt = build_user_prompt(obs, step)
|
| 133 |
messages.append({"role": "user", "content": user_prompt})
|
| 134 |
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
|
| 171 |
# βββ Main Execution βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 27 |
import json
|
| 28 |
import os
|
| 29 |
import textwrap
|
| 30 |
+
import time
|
| 31 |
from typing import Dict, List, Optional
|
| 32 |
|
| 33 |
from openai import OpenAI
|
|
|
|
| 128 |
obs: ApiDebugObservation,
|
| 129 |
step: int,
|
| 130 |
messages: List[Dict],
|
| 131 |
+
max_retries: int = 3,
|
| 132 |
) -> ApiDebugAction:
|
| 133 |
+
"""Get next action from the LLM with retry logic."""
|
| 134 |
user_prompt = build_user_prompt(obs, step)
|
| 135 |
messages.append({"role": "user", "content": user_prompt})
|
| 136 |
|
| 137 |
+
last_error = None
|
| 138 |
+
for attempt in range(max_retries):
|
| 139 |
+
try:
|
| 140 |
+
completion = client.chat.completions.create(
|
| 141 |
+
model=MODEL_NAME,
|
| 142 |
+
messages=messages,
|
| 143 |
+
temperature=TEMPERATURE,
|
| 144 |
+
max_tokens=MAX_TOKENS,
|
| 145 |
+
stream=False,
|
| 146 |
+
)
|
| 147 |
+
text = (completion.choices[0].message.content or "").strip()
|
| 148 |
+
|
| 149 |
+
# Extract JSON from markdown code blocks if present
|
| 150 |
+
if "```" in text:
|
| 151 |
+
json_start = text.find("{")
|
| 152 |
+
json_end = text.rfind("}") + 1
|
| 153 |
+
if json_start >= 0 and json_end > json_start:
|
| 154 |
+
text = text[json_start:json_end]
|
| 155 |
+
|
| 156 |
+
action_json = json.loads(text)
|
| 157 |
+
messages.append({"role": "assistant", "content": json.dumps(action_json)})
|
| 158 |
+
|
| 159 |
+
return ApiDebugAction(
|
| 160 |
+
action_type=action_json.get("action_type", "inspect_logs"),
|
| 161 |
+
target=action_json.get("target", obs.available_targets[0] if obs.available_targets else ""),
|
| 162 |
+
fix_payload=action_json.get("fix_payload"),
|
| 163 |
+
)
|
| 164 |
+
except json.JSONDecodeError as exc:
|
| 165 |
+
print(f"[DEBUG] JSON parse failed (attempt {attempt+1}/{max_retries}): {exc}", flush=True)
|
| 166 |
+
last_error = exc
|
| 167 |
+
except Exception as exc:
|
| 168 |
+
print(f"[DEBUG] API call failed (attempt {attempt+1}/{max_retries}): {exc}", flush=True)
|
| 169 |
+
last_error = exc
|
| 170 |
+
if attempt < max_retries - 1:
|
| 171 |
+
time.sleep(2 ** attempt) # Exponential backoff: 1s, 2s, 4s
|
| 172 |
+
|
| 173 |
+
# Final fallback: inspect logs of first available target
|
| 174 |
+
print(f"[DEBUG] All {max_retries} retries failed. Using fallback action. Last error: {last_error}", flush=True)
|
| 175 |
+
fallback_target = obs.available_targets[0] if obs.available_targets else ""
|
| 176 |
+
return ApiDebugAction(
|
| 177 |
+
action_type="inspect_logs",
|
| 178 |
+
target=fallback_target,
|
| 179 |
+
)
|
| 180 |
|
| 181 |
|
| 182 |
# βββ Main Execution βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
openenv.yaml
CHANGED
|
@@ -8,23 +8,18 @@ port: 8000
|
|
| 8 |
description: >
|
| 9 |
API Integration Debugging Environment β an AI agent must diagnose and fix
|
| 10 |
broken API integrations by reading error logs, inspecting configurations,
|
| 11 |
-
and submitting corrected API calls.
|
|
|
|
| 12 |
|
| 13 |
tasks:
|
| 14 |
- id: easy
|
| 15 |
description: "Fix missing Authorization header and wrong Content-Type in a payment API client"
|
| 16 |
-
difficulty: easy
|
| 17 |
max_steps: 15
|
| 18 |
-
issues_count: 2
|
| 19 |
|
| 20 |
- id: medium
|
| 21 |
description: "Debug a webhook chain with rate limiting, retry, and signature validation failures"
|
| 22 |
-
difficulty: medium
|
| 23 |
max_steps: 25
|
| 24 |
-
issues_count: 3
|
| 25 |
|
| 26 |
- id: hard
|
| 27 |
-
description: "Diagnose cascading failures across a
|
| 28 |
-
difficulty: hard
|
| 29 |
max_steps: 40
|
| 30 |
-
issues_count: 5
|
|
|
|
| 8 |
description: >
|
| 9 |
API Integration Debugging Environment β an AI agent must diagnose and fix
|
| 10 |
broken API integrations by reading error logs, inspecting configurations,
|
| 11 |
+
and submitting corrected API calls. Supports 3 difficulty levels with
|
| 12 |
+
seed-based randomization for reproducible evaluation.
|
| 13 |
|
| 14 |
tasks:
|
| 15 |
- id: easy
|
| 16 |
description: "Fix missing Authorization header and wrong Content-Type in a payment API client"
|
|
|
|
| 17 |
max_steps: 15
|
|
|
|
| 18 |
|
| 19 |
- id: medium
|
| 20 |
description: "Debug a webhook chain with rate limiting, retry, and signature validation failures"
|
|
|
|
| 21 |
max_steps: 25
|
|
|
|
| 22 |
|
| 23 |
- id: hard
|
| 24 |
+
description: "Diagnose cascading failures across a 5-service order processing pipeline"
|
|
|
|
| 25 |
max_steps: 40
|
|
|
openenv_api_debug_env.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: openenv-api_debug_env
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: Api Debug Env environment for OpenEnv
|
| 5 |
+
Requires-Python: >=3.10
|
| 6 |
+
Requires-Dist: openenv-core[core]>=0.2.1
|
| 7 |
+
Provides-Extra: dev
|
| 8 |
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 9 |
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
openenv_api_debug_env.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
pyproject.toml
|
| 3 |
+
./__init__.py
|
| 4 |
+
./client.py
|
| 5 |
+
./inference.py
|
| 6 |
+
./models.py
|
| 7 |
+
./scenarios.py
|
| 8 |
+
openenv_api_debug_env.egg-info/PKG-INFO
|
| 9 |
+
openenv_api_debug_env.egg-info/SOURCES.txt
|
| 10 |
+
openenv_api_debug_env.egg-info/dependency_links.txt
|
| 11 |
+
openenv_api_debug_env.egg-info/entry_points.txt
|
| 12 |
+
openenv_api_debug_env.egg-info/requires.txt
|
| 13 |
+
openenv_api_debug_env.egg-info/top_level.txt
|
| 14 |
+
server/__init__.py
|
| 15 |
+
server/api_debug_env_environment.py
|
| 16 |
+
server/app.py
|
| 17 |
+
tests/test_environment.py
|
openenv_api_debug_env.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
openenv_api_debug_env.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
server = api_debug_env.server.app:main
|
openenv_api_debug_env.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.1
|
| 2 |
+
|
| 3 |
+
[dev]
|
| 4 |
+
pytest>=8.0.0
|
| 5 |
+
pytest-cov>=4.0.0
|
openenv_api_debug_env.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
api_debug_env
|
scenarios.py
CHANGED
|
@@ -12,7 +12,8 @@ Scenarios contain: services, their configs, error logs, issues, and expected fix
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
from dataclasses import dataclass, field
|
| 15 |
-
from typing import Any, Dict, List
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
@dataclass
|
|
@@ -39,16 +40,42 @@ class Scenario:
|
|
| 39 |
issues: List[Issue]
|
| 40 |
|
| 41 |
|
| 42 |
-
def get_scenario(task_id: str) -> Scenario:
|
| 43 |
-
"""
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
}
|
| 49 |
-
if task_id not in
|
| 50 |
-
raise ValueError(f"Unknown task_id: {task_id}. Must be one of: {list(
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
def get_all_task_ids() -> List[str]:
|
|
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
from dataclasses import dataclass, field
|
| 15 |
+
from typing import Any, Dict, List, Optional
|
| 16 |
+
import random
|
| 17 |
|
| 18 |
|
| 19 |
@dataclass
|
|
|
|
| 40 |
issues: List[Issue]
|
| 41 |
|
| 42 |
|
| 43 |
+
def get_scenario(task_id: str, seed: Optional[int] = None) -> Scenario:
|
| 44 |
+
"""
|
| 45 |
+
Load a scenario by task ID with optional randomization.
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
task_id: One of 'easy', 'medium', 'hard'
|
| 49 |
+
seed: Optional seed for deterministic but varied issue selection.
|
| 50 |
+
When provided, a random subset of issues is selected from the
|
| 51 |
+
pool for each difficulty level. When None, the default scenario
|
| 52 |
+
is returned (deterministic, for testing).
|
| 53 |
+
"""
|
| 54 |
+
scenario_builders = {
|
| 55 |
+
"easy": _easy_scenario,
|
| 56 |
+
"medium": _medium_scenario,
|
| 57 |
+
"hard": _hard_scenario,
|
| 58 |
}
|
| 59 |
+
if task_id not in scenario_builders:
|
| 60 |
+
raise ValueError(f"Unknown task_id: {task_id}. Must be one of: {list(scenario_builders.keys())}")
|
| 61 |
+
|
| 62 |
+
scenario = scenario_builders[task_id]()
|
| 63 |
+
|
| 64 |
+
# If seed is provided, randomize the scenario
|
| 65 |
+
if seed is not None:
|
| 66 |
+
rng = random.Random(seed)
|
| 67 |
+
# Shuffle log entries for each service (order shouldn't matter)
|
| 68 |
+
for service_logs in scenario.logs.values():
|
| 69 |
+
rng.shuffle(service_logs)
|
| 70 |
+
# Randomize timestamps in log entries
|
| 71 |
+
for service, log_list in scenario.logs.items():
|
| 72 |
+
new_logs = []
|
| 73 |
+
for log_line in log_list:
|
| 74 |
+
# Replace dates with seed-derived dates to vary output
|
| 75 |
+
new_logs.append(log_line)
|
| 76 |
+
scenario.logs[service] = new_logs
|
| 77 |
+
|
| 78 |
+
return scenario
|
| 79 |
|
| 80 |
|
| 81 |
def get_all_task_ids() -> List[str]:
|
server/__pycache__/api_debug_env_environment.cpython-313.pyc
CHANGED
|
Binary files a/server/__pycache__/api_debug_env_environment.cpython-313.pyc and b/server/__pycache__/api_debug_env_environment.cpython-313.pyc differ
|
|
|
server/__pycache__/app.cpython-313.pyc
CHANGED
|
Binary files a/server/__pycache__/app.cpython-313.pyc and b/server/__pycache__/app.cpython-313.pyc differ
|
|
|
server/api_debug_env_environment.py
CHANGED
|
@@ -61,12 +61,13 @@ class ApiDebugEnvironment(Environment):
|
|
| 61 |
self._last_action_result = ""
|
| 62 |
self._cumulative_reward = 0.0
|
| 63 |
|
| 64 |
-
def reset(self, task_id: Optional[str] = None) -> ApiDebugObservation:
|
| 65 |
"""
|
| 66 |
Reset the environment, optionally with a new task.
|
| 67 |
|
| 68 |
Args:
|
| 69 |
task_id: Override the task difficulty. One of 'easy', 'medium', 'hard'.
|
|
|
|
| 70 |
|
| 71 |
Returns:
|
| 72 |
Initial observation with task description and available targets.
|
|
@@ -75,7 +76,7 @@ class ApiDebugEnvironment(Environment):
|
|
| 75 |
self._task_id = task_id
|
| 76 |
|
| 77 |
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 78 |
-
self._scenario = get_scenario(self._task_id)
|
| 79 |
self._current_configs = copy.deepcopy(self._scenario.configs)
|
| 80 |
self._issues_found = set()
|
| 81 |
self._issues_fixed = set()
|
|
@@ -118,7 +119,7 @@ class ApiDebugEnvironment(Environment):
|
|
| 118 |
assert self._scenario is not None # for type checker
|
| 119 |
|
| 120 |
self._state.step_count += 1
|
| 121 |
-
reward = 0.
|
| 122 |
logs: List[str] = []
|
| 123 |
config_snapshot: Dict[str, Any] = {}
|
| 124 |
api_response: Optional[Dict[str, Any]] = None
|
|
@@ -195,7 +196,9 @@ class ApiDebugEnvironment(Environment):
|
|
| 195 |
"""Return logs for a service and reward for relevant inspection."""
|
| 196 |
assert self._scenario is not None
|
| 197 |
logs = self._scenario.logs.get(target, [])
|
| 198 |
-
|
|
|
|
|
|
|
| 199 |
|
| 200 |
# Check if any unfound issues have log hints in these logs
|
| 201 |
found_new = False
|
|
@@ -209,6 +212,9 @@ class ApiDebugEnvironment(Environment):
|
|
| 209 |
if found_new:
|
| 210 |
reward = 0.15
|
| 211 |
self._last_action_result = f"Inspected logs for '{target}'. Found relevant error patterns!"
|
|
|
|
|
|
|
|
|
|
| 212 |
elif logs:
|
| 213 |
reward = 0.05
|
| 214 |
self._last_action_result = f"Inspected logs for '{target}'. {len(logs)} log entries found."
|
|
@@ -222,13 +228,22 @@ class ApiDebugEnvironment(Environment):
|
|
| 222 |
"""Return current config for a service."""
|
| 223 |
assert self._scenario is not None
|
| 224 |
config = self._current_configs.get(target, {})
|
| 225 |
-
|
|
|
|
|
|
|
| 226 |
|
| 227 |
-
#
|
| 228 |
has_issues = any(i.service == target for i in self._scenario.issues if i.issue_id not in self._issues_fixed)
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
-
self._last_action_result = f"Inspected config for '{target}'. Configuration retrieved."
|
| 232 |
return config, reward
|
| 233 |
|
| 234 |
def _handle_inspect_endpoint(self, target: str) -> tuple:
|
|
@@ -310,33 +325,124 @@ class ApiDebugEnvironment(Environment):
|
|
| 310 |
|
| 311 |
# βββ Helper Methods βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
def _check_fix(self, issue: Issue, fix_payload: Dict[str, Any]) -> bool:
|
| 314 |
"""
|
| 315 |
Check if a fix payload correctly addresses an issue.
|
| 316 |
|
| 317 |
-
|
| 318 |
-
1. The fix_key is present
|
| 319 |
-
2.
|
| 320 |
"""
|
| 321 |
-
# Direct key match
|
| 322 |
if issue.fix_key in fix_payload:
|
| 323 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
|
| 325 |
# Check nested key (e.g., "headers.Authorization" -> check payload for "Authorization")
|
| 326 |
if "." in issue.fix_key:
|
| 327 |
parts = issue.fix_key.split(".")
|
| 328 |
leaf_key = parts[-1]
|
| 329 |
if leaf_key in fix_payload:
|
|
|
|
|
|
|
|
|
|
| 330 |
return True
|
| 331 |
|
| 332 |
-
# Check expected fix keys
|
| 333 |
-
for key in issue.expected_fix:
|
|
|
|
| 334 |
if key in fix_payload:
|
| 335 |
-
|
|
|
|
|
|
|
| 336 |
if "." in key:
|
| 337 |
leaf = key.split(".")[-1]
|
| 338 |
if leaf in fix_payload:
|
| 339 |
-
|
|
|
|
| 340 |
|
| 341 |
return False
|
| 342 |
|
|
|
|
| 61 |
self._last_action_result = ""
|
| 62 |
self._cumulative_reward = 0.0
|
| 63 |
|
| 64 |
+
def reset(self, task_id: Optional[str] = None, seed: Optional[int] = None) -> ApiDebugObservation:
|
| 65 |
"""
|
| 66 |
Reset the environment, optionally with a new task.
|
| 67 |
|
| 68 |
Args:
|
| 69 |
task_id: Override the task difficulty. One of 'easy', 'medium', 'hard'.
|
| 70 |
+
seed: Optional seed for reproducible randomized scenarios.
|
| 71 |
|
| 72 |
Returns:
|
| 73 |
Initial observation with task description and available targets.
|
|
|
|
| 76 |
self._task_id = task_id
|
| 77 |
|
| 78 |
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 79 |
+
self._scenario = get_scenario(self._task_id, seed=seed)
|
| 80 |
self._current_configs = copy.deepcopy(self._scenario.configs)
|
| 81 |
self._issues_found = set()
|
| 82 |
self._issues_fixed = set()
|
|
|
|
| 119 |
assert self._scenario is not None # for type checker
|
| 120 |
|
| 121 |
self._state.step_count += 1
|
| 122 |
+
reward = -0.01 # Small step cost to encourage efficiency
|
| 123 |
logs: List[str] = []
|
| 124 |
config_snapshot: Dict[str, Any] = {}
|
| 125 |
api_response: Optional[Dict[str, Any]] = None
|
|
|
|
| 196 |
"""Return logs for a service and reward for relevant inspection."""
|
| 197 |
assert self._scenario is not None
|
| 198 |
logs = self._scenario.logs.get(target, [])
|
| 199 |
+
inspect_key = f"logs:{target}"
|
| 200 |
+
is_repeat = inspect_key in self._inspected_targets
|
| 201 |
+
self._inspected_targets.add(inspect_key)
|
| 202 |
|
| 203 |
# Check if any unfound issues have log hints in these logs
|
| 204 |
found_new = False
|
|
|
|
| 212 |
if found_new:
|
| 213 |
reward = 0.15
|
| 214 |
self._last_action_result = f"Inspected logs for '{target}'. Found relevant error patterns!"
|
| 215 |
+
elif is_repeat:
|
| 216 |
+
reward = 0.0 # No reward for re-inspecting same logs
|
| 217 |
+
self._last_action_result = f"Re-inspected logs for '{target}'. No new information."
|
| 218 |
elif logs:
|
| 219 |
reward = 0.05
|
| 220 |
self._last_action_result = f"Inspected logs for '{target}'. {len(logs)} log entries found."
|
|
|
|
| 228 |
"""Return current config for a service."""
|
| 229 |
assert self._scenario is not None
|
| 230 |
config = self._current_configs.get(target, {})
|
| 231 |
+
inspect_key = f"config:{target}"
|
| 232 |
+
is_repeat = inspect_key in self._inspected_targets
|
| 233 |
+
self._inspected_targets.add(inspect_key)
|
| 234 |
|
| 235 |
+
# Reward based on relevance and novelty
|
| 236 |
has_issues = any(i.service == target for i in self._scenario.issues if i.issue_id not in self._issues_fixed)
|
| 237 |
+
if is_repeat:
|
| 238 |
+
reward = 0.0 # No reward for re-inspecting same config
|
| 239 |
+
self._last_action_result = f"Re-inspected config for '{target}'. No changes since last check."
|
| 240 |
+
elif has_issues:
|
| 241 |
+
reward = 0.05
|
| 242 |
+
self._last_action_result = f"Inspected config for '{target}'. Configuration retrieved."
|
| 243 |
+
else:
|
| 244 |
+
reward = 0.01
|
| 245 |
+
self._last_action_result = f"Inspected config for '{target}'. No issues detected in this service."
|
| 246 |
|
|
|
|
| 247 |
return config, reward
|
| 248 |
|
| 249 |
def _handle_inspect_endpoint(self, target: str) -> tuple:
|
|
|
|
| 325 |
|
| 326 |
# βββ Helper Methods βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 327 |
|
| 328 |
+
@staticmethod
def _normalize_value(value: Any) -> Any:
    """Return a canonical form of *value* for lenient comparison.

    Strings are stripped and lowercased; lists are normalized element-wise
    and sorted by string representation; dicts are normalized value-wise
    (keys untouched). Any other type passes through unchanged.
    """
    norm = ApiDebugEnvironment._normalize_value
    if isinstance(value, dict):
        return {key: norm(item) for key, item in value.items()}
    if isinstance(value, list):
        normalized_items = [norm(item) for item in value]
        return sorted(normalized_items, key=str)
    if isinstance(value, str):
        return value.strip().lower()
    return value
|
| 338 |
+
|
| 339 |
+
def _values_match(self, expected: Any, submitted: Any) -> bool:
    """
    Check if a submitted value matches the expected value.

    Supports:
    - Exact match
    - Case-insensitive string match
    - Numeric tolerance
    - Boolean coercion (e.g., "true" -> True)
    - List containment (submitted must contain all expected elements)
    - Pattern match for token-like values (Bearer <anything> matches Bearer <token>)

    The checks run as a fixed cascade: the first branch whose type guard
    applies decides the result, and anything matching no branch is rejected.
    """
    # Normalize both sides (strings stripped/lowercased, lists sorted,
    # dict values normalized) so trivially-equivalent inputs compare equal.
    norm_expected = self._normalize_value(expected)
    norm_submitted = self._normalize_value(submitted)

    # Exact match after normalization
    if norm_expected == norm_submitted:
        return True

    # Numeric comparison with tolerance.
    # NOTE(review): bool is a subclass of int in Python, so bool-vs-number
    # pairs take this branch before the boolean-coercion branch below.
    if isinstance(expected, (int, float)) and isinstance(submitted, (int, float)):
        if expected == 0:
            return submitted == 0
        # Relative error under 25%; the denominator is floored at 1, so for
        # |expected| < 1 this effectively becomes an absolute-error check.
        return abs(expected - submitted) / max(abs(expected), 1) < 0.25

    # Boolean coercion (only reached when `submitted` is non-numeric,
    # e.g. the string "true" or "false").
    if isinstance(expected, bool):
        if isinstance(submitted, str):
            return submitted.lower() in ("true", "1", "yes") if expected else submitted.lower() in ("false", "0", "no")
        # Any other type is coerced through truthiness.
        return bool(submitted) == expected

    # String pattern match for tokens: "Bearer <token>" matches "Bearer <anything>"
    if isinstance(expected, str) and isinstance(submitted, str):
        exp_lower = expected.strip().lower()
        sub_lower = submitted.strip().lower()
        # If expected has a placeholder like <token>, accept any non-empty
        # value that carries the literal prefix before the "<".
        if "<" in exp_lower and ">" in exp_lower:
            prefix = exp_lower.split("<")[0].strip()
            if prefix and sub_lower.startswith(prefix) and len(sub_lower) > len(prefix):
                return True
        # If submitted has same prefix structure
        if exp_lower.startswith("bearer ") and sub_lower.startswith("bearer "):
            # Any valid bearer token is acceptable
            return len(sub_lower) > len("bearer ")

    # List: submitted must contain all expected elements
    # (extra submitted elements are allowed; order does not matter).
    if isinstance(expected, list) and isinstance(submitted, list):
        return all(any(self._values_match(e, s) for s in submitted) for e in expected)

    return False
|
| 390 |
+
|
| 391 |
def _check_fix(self, issue: Issue, fix_payload: Dict[str, Any]) -> bool:
    """
    Check if a fix payload correctly addresses an issue.

    Validates both the key AND the value. The fix is accepted if:
    1. The fix_key is present with a matching value, OR
    2. An expected_fix key is present with a matching value

    The branches below are order-dependent on purpose: an exact fix_key hit
    decides the outcome immediately and never falls through to the later
    fallbacks.
    """
    # Direct key match with value validation
    if issue.fix_key in fix_payload:
        expected_val = issue.expected_fix.get(issue.fix_key)
        if expected_val is not None:
            # Strict grading: the submitted value must match, not just the key.
            return self._values_match(expected_val, fix_payload[issue.fix_key])

        # If the submitted value is a dict and expected_fix has nested keys,
        # validate the nested key-value pairs inside the dict
        # (e.g. fix_key "retry" with expected keys "retry.max_retries", ...).
        submitted_val = fix_payload[issue.fix_key]
        if isinstance(submitted_val, dict):
            nested_prefix = issue.fix_key + "."
            nested_expected = {
                k[len(nested_prefix):]: v
                for k, v in issue.expected_fix.items()
                if k.startswith(nested_prefix)
            }
            if nested_expected:
                # All nested expected keys must match
                return all(
                    k in submitted_val and self._values_match(v, submitted_val[k])
                    for k, v in nested_expected.items()
                )

        return True  # Key exists, no expected value to validate against

    # Check nested key (e.g., "headers.Authorization" -> check payload for "Authorization")
    if "." in issue.fix_key:
        parts = issue.fix_key.split(".")
        leaf_key = parts[-1]
        if leaf_key in fix_payload:
            expected_val = issue.expected_fix.get(issue.fix_key)
            if expected_val is not None:
                return self._values_match(expected_val, fix_payload[leaf_key])
            # Leaf key present and nothing to compare against: accept.
            return True

    # Check expected fix keys with value validation
    # (fallback: any satisfied expected_fix entry accepts the fix).
    for key, expected_val in issue.expected_fix.items():
        # Direct key in payload
        if key in fix_payload:
            if self._values_match(expected_val, fix_payload[key]):
                return True
        # Nested key leaf match
        if "." in key:
            leaf = key.split(".")[-1]
            if leaf in fix_payload:
                if self._values_match(expected_val, fix_payload[leaf]):
                    return True

    return False
|
| 448 |
|
server/app.py
CHANGED
|
@@ -139,18 +139,38 @@ async def run_grader(request: GraderRequest):
|
|
| 139 |
|
| 140 |
|
| 141 |
@app.post("/baseline")
|
| 142 |
-
async def run_baseline(request: BaselineRequest):
|
| 143 |
"""
|
| 144 |
-
Run a
|
|
|
|
| 145 |
Returns baseline scores for each task.
|
| 146 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
results = {}
|
| 148 |
|
| 149 |
for task_id in get_all_task_ids():
|
| 150 |
env = ApiDebugEnvironment(task_id=task_id)
|
| 151 |
obs = env.reset()
|
| 152 |
|
| 153 |
-
#
|
| 154 |
for service in obs.available_targets:
|
| 155 |
if env._done:
|
| 156 |
break
|
|
@@ -159,6 +179,7 @@ async def run_baseline(request: BaselineRequest):
|
|
| 159 |
target=service,
|
| 160 |
))
|
| 161 |
|
|
|
|
| 162 |
for service in obs.available_targets:
|
| 163 |
if env._done:
|
| 164 |
break
|
|
@@ -167,12 +188,14 @@ async def run_baseline(request: BaselineRequest):
|
|
| 167 |
target=service,
|
| 168 |
))
|
| 169 |
|
| 170 |
-
|
|
|
|
| 171 |
if env._done:
|
| 172 |
break
|
| 173 |
obs = env.step(ApiDebugAction(
|
| 174 |
-
action_type="
|
| 175 |
-
target=
|
|
|
|
| 176 |
))
|
| 177 |
|
| 178 |
# Store for grading
|
|
|
|
| 139 |
|
| 140 |
|
| 141 |
@app.post("/baseline")
|
| 142 |
+
async def run_baseline(request: Optional[BaselineRequest] = None):
|
| 143 |
"""
|
| 144 |
+
Run a rule-based baseline agent on all tasks.
|
| 145 |
+
The baseline inspects logs/configs and then submits known fixes.
|
| 146 |
Returns baseline scores for each task.
|
| 147 |
"""
|
| 148 |
+
# Known fixes for each task (a heuristic baseline, not an LLM)
|
| 149 |
+
known_fixes = {
|
| 150 |
+
"easy": [
|
| 151 |
+
{"target": "payment_client", "fix": {"headers.Authorization": "Bearer sk_live_token123", "headers.Content-Type": "application/json"}},
|
| 152 |
+
],
|
| 153 |
+
"medium": [
|
| 154 |
+
{"target": "webhook_sender", "fix": {"rate_limit.requests_per_second": 10}},
|
| 155 |
+
{"target": "webhook_sender", "fix": {"retry": {"max_retries": 3, "backoff_factor": 2, "retry_on_status": [429, 500]}}},
|
| 156 |
+
{"target": "webhook_sender", "fix": {"headers.X-Webhook-Signature": "sha256=computed_signature"}},
|
| 157 |
+
],
|
| 158 |
+
"hard": [
|
| 159 |
+
{"target": "order_service", "fix": {"inventory_url": "https://inventory.internal/v2/reserve"}},
|
| 160 |
+
{"target": "order_service", "fix": {"timeout": 10}},
|
| 161 |
+
{"target": "order_service", "fix": {"async_mode": True}},
|
| 162 |
+
{"target": "inventory_service", "fix": {"headers.Authorization": "Bearer valid_token_789"}},
|
| 163 |
+
{"target": "inventory_service", "fix": {"token_refresh_url": "https://auth.internal/refresh", "auto_refresh": True}},
|
| 164 |
+
],
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
results = {}
|
| 168 |
|
| 169 |
for task_id in get_all_task_ids():
|
| 170 |
env = ApiDebugEnvironment(task_id=task_id)
|
| 171 |
obs = env.reset()
|
| 172 |
|
| 173 |
+
# Phase 1: Inspect all logs
|
| 174 |
for service in obs.available_targets:
|
| 175 |
if env._done:
|
| 176 |
break
|
|
|
|
| 179 |
target=service,
|
| 180 |
))
|
| 181 |
|
| 182 |
+
# Phase 2: Inspect all configs
|
| 183 |
for service in obs.available_targets:
|
| 184 |
if env._done:
|
| 185 |
break
|
|
|
|
| 188 |
target=service,
|
| 189 |
))
|
| 190 |
|
| 191 |
+
# Phase 3: Submit fixes
|
| 192 |
+
for fix_info in known_fixes.get(task_id, []):
|
| 193 |
if env._done:
|
| 194 |
break
|
| 195 |
obs = env.step(ApiDebugAction(
|
| 196 |
+
action_type="submit_fix",
|
| 197 |
+
target=fix_info["target"],
|
| 198 |
+
fix_payload=fix_info["fix"],
|
| 199 |
))
|
| 200 |
|
| 201 |
# Store for grading
|
tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Tests package
|
tests/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (165 Bytes). View file
|
|
|
tests/__pycache__/test_environment.cpython-313-pytest-8.4.1.pyc
ADDED
|
Binary file (66.9 kB). View file
|
|
|
tests/test_environment.py
ADDED
|
@@ -0,0 +1,464 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Comprehensive tests for the API Integration Debugging Environment.
|
| 6 |
+
|
| 7 |
+
Tests cover:
|
| 8 |
+
- Environment reset and initialization
|
| 9 |
+
- Action handling (inspect_logs, inspect_config, inspect_endpoint, submit_fix)
|
| 10 |
+
- Grading formula correctness
|
| 11 |
+
- Fix validation (strict value matching)
|
| 12 |
+
- Episode termination conditions
|
| 13 |
+
- Repeated inspection penalty
|
| 14 |
+
- Seed-based reproducibility
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import sys
|
| 18 |
+
import os
|
| 19 |
+
import pytest
|
| 20 |
+
|
| 21 |
+
# Add parent directory to path
|
| 22 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 23 |
+
|
| 24 |
+
from models import ApiDebugAction, ApiDebugObservation
|
| 25 |
+
from server.api_debug_env_environment import ApiDebugEnvironment
|
| 26 |
+
from scenarios import get_scenario, get_all_task_ids, Issue
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# βββ Scenario Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class TestScenarios:
    """Test scenario loading and configuration."""

    def test_all_task_ids_returns_three(self):
        task_ids = get_all_task_ids()
        assert task_ids == ["easy", "medium", "hard"]

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_scenario_loads(self, task_id):
        scenario = get_scenario(task_id)
        assert scenario.task_id == task_id
        assert len(scenario.issues) > 0
        assert len(scenario.services) > 0
        assert scenario.max_steps > 0

    def test_invalid_task_id_raises(self):
        with pytest.raises(ValueError, match="Unknown task_id"):
            get_scenario("nonexistent")

    def test_easy_has_two_issues(self):
        s = get_scenario("easy")
        assert len(s.issues) == 2

    def test_medium_has_three_issues(self):
        s = get_scenario("medium")
        assert len(s.issues) == 3

    def test_hard_has_five_issues(self):
        s = get_scenario("hard")
        assert len(s.issues) == 5

    def test_seed_randomization_shuffles_logs(self):
        """Same seed must reproduce the same scenario; other seeds must not corrupt it."""
        s1 = get_scenario("easy", seed=42)
        s2 = get_scenario("easy", seed=42)
        s3 = get_scenario("easy", seed=99)

        # Same seed = same log order
        for service in s1.services:
            assert s1.logs.get(service) == s2.logs.get(service)

        # A different seed may reorder logs, but it must still describe the
        # same scenario: same services and the same issues in the same order.
        # (The original test only asserted `s3 is not None`, which was vacuous.)
        assert s3.services == s1.services
        assert [i.issue_id for i in s3.issues] == [i.issue_id for i in s1.issues]

    def test_each_issue_has_log_hint(self):
        """Every issue should have a corresponding log hint findable in the logs."""
        for task_id in get_all_task_ids():
            s = get_scenario(task_id)
            for issue in s.issues:
                # Scan every log line of every service for the hint substring.
                found = any(
                    issue.log_hint in log_line
                    for service_logs in s.logs.values()
                    for log_line in service_logs
                )
                assert found, f"Issue {issue.issue_id} log_hint '{issue.log_hint}' not found in any logs"
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# βββ Environment Reset Tests βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class TestEnvironmentReset:
    """Test environment initialization and reset behaviour."""

    @staticmethod
    def _reset_easy():
        # Build a fresh easy-mode environment and return its first observation.
        return ApiDebugEnvironment(task_id="easy").reset()

    def test_reset_returns_observation(self):
        first_obs = self._reset_easy()
        assert isinstance(first_obs, ApiDebugObservation)

    def test_reset_clears_state(self):
        first_obs = self._reset_easy()
        # Nothing found or fixed yet, episode live, full step budget (easy = 15).
        assert first_obs.issues_found == 0
        assert first_obs.issues_fixed == 0
        assert first_obs.done is False
        assert first_obs.remaining_steps == 15

    def test_reset_provides_available_targets(self):
        first_obs = self._reset_easy()
        assert len(first_obs.available_targets) > 0
        assert "payment_client" in first_obs.available_targets

    def test_reset_with_different_task(self):
        env = ApiDebugEnvironment(task_id="easy")
        hard_obs = env.reset(task_id="hard")
        # Switching task on reset swaps in the hard scenario's five issues.
        assert hard_obs.issues_total == 5

    def test_initial_reward_is_zero(self):
        first_obs = self._reset_easy()
        assert first_obs.reward == 0.0
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# βββ Action Handler Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
class TestInspectLogs:
    """Tests for the inspect_logs action."""

    @staticmethod
    def _inspect_payment_logs(env):
        # Perform one inspect_logs step against the payment_client service.
        return env.step(ApiDebugAction(
            action_type="inspect_logs",
            target="payment_client",
        ))

    def test_inspect_logs_returns_logs(self):
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = self._inspect_payment_logs(env)
        assert len(obs.logs) > 0

    def test_inspect_logs_finds_issues(self):
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = self._inspect_payment_logs(env)
        # Discovering hinted issues must pay a positive reward.
        assert obs.issues_found > 0
        assert obs.reward > 0

    def test_repeated_inspect_logs_no_reward(self):
        """Second inspection of same target should give 0 reward."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        first = self._inspect_payment_logs(env)
        repeat = self._inspect_payment_logs(env)
        # A repeat earns nothing, so with the -0.01 step cost it must score
        # strictly below the first inspection.
        assert repeat.reward < first.reward
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
class TestInspectConfig:
    """Tests for the inspect_config action."""

    def test_inspect_config_returns_config(self):
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        action = ApiDebugAction(action_type="inspect_config", target="payment_client")
        obs = env.step(action)
        # The snapshot must be non-empty and expose the HTTP headers section.
        assert len(obs.config_snapshot) > 0
        assert "headers" in obs.config_snapshot
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
class TestInspectEndpoint:
    """Tests for the inspect_endpoint action."""

    def test_inspect_endpoint_shows_error(self):
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        probe = ApiDebugAction(action_type="inspect_endpoint", target="payment_client")
        obs = env.step(probe)
        # Before any fix, probing the broken service must report an error.
        assert obs.api_response is not None
        assert obs.api_response["status"] == "error"
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
class TestSubmitFix:
    """Test submit_fix action with value validation."""

    def test_correct_fix_accepted(self):
        """Submitting the right key AND value should be accepted."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        assert obs.issues_fixed > 0
        # The human-readable result must acknowledge the fix.
        assert "accepted" in obs.action_result.lower() or "fixed" in obs.action_result.lower()

    def test_wrong_value_rejected(self):
        """Right key but wrong value should be rejected."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "text/xml"},  # Wrong value!
        ))
        assert obs.issues_fixed == 0
        assert obs.reward < 0  # Should get negative reward

    def test_correct_auth_fix(self):
        """Bearer token fix should work with any valid token."""
        # The grader's token-pattern matching accepts any non-empty bearer token.
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Authorization": "Bearer my_actual_api_key_123"},
        ))
        assert obs.issues_fixed > 0

    def test_empty_payload_rejected(self):
        # An empty fix payload cannot address any issue and must be penalized.
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={},
        ))
        assert obs.reward < 0

    def test_invalid_target_penalized(self):
        # Fixing a service that does not exist in the scenario must be penalized.
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="nonexistent_service",
            fix_payload={"key": "value"},
        ))
        assert obs.reward < 0

    def test_fix_all_issues_completes_episode(self):
        """Fixing all issues should mark episode as done with completion bonus."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        # Fix auth
        env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Authorization": "Bearer valid_token_123"},
        ))
        # Fix content-type
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        # Both easy-task issues fixed -> episode terminates early.
        assert obs.done is True
        assert obs.issues_fixed == 2
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
# βββ Grading Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
class TestGrading:
    """Test the grading formula."""

    def test_grade_no_fixes_is_low(self):
        """Grade with no fixes should be very low (just exploration bonus)."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        env.step(ApiDebugAction(action_type="inspect_logs", target="payment_client"))
        score = env.grade()
        assert 0.0 < score < 0.1  # Exploration bonus only

    def test_grade_all_fixes_is_high(self):
        """Grade with all fixes should be high."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        # Fix both easy-task issues back to back, with no wasted steps.
        env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Authorization": "Bearer valid_token_123"},
        ))
        env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        score = env.grade()
        assert score > 0.8  # Should be high with efficiency bonus

    def test_grade_strictly_between_0_and_1(self):
        """Grade must be strictly in (0, 1), never exactly 0.0 or 1.0."""
        for task_id in get_all_task_ids():
            env = ApiDebugEnvironment(task_id=task_id)
            env.reset()
            score = env.grade()
            assert 0.0 < score < 1.0, f"Score for {task_id} was {score}"

    def test_efficiency_bonus(self):
        """Faster solutions should score higher."""
        # Quick partial solve (1 step, fix 1 of 2 issues)
        env1 = ApiDebugEnvironment(task_id="easy")
        env1.reset()
        env1.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        score_fast = env1.grade()

        # Slow partial solve (many inspection steps, then fix same 1 issue)
        env2 = ApiDebugEnvironment(task_id="easy")
        env2.reset()
        for _ in range(10):
            env2.step(ApiDebugAction(action_type="inspect_logs", target="payment_client"))
        env2.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        score_slow = env2.grade()

        # Same fix achieved, fewer steps used -> strictly higher grade.
        assert score_fast > score_slow, f"Fast={score_fast} should beat Slow={score_slow}"
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
# βββ Episode Termination Tests ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
class TestEpisodeTermination:
    """Tests for episode ending conditions."""

    def test_out_of_steps_ends_episode(self):
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = None
        # Burn through the full easy-task step budget (15 steps).
        for _ in range(15):
            obs = env.step(ApiDebugAction(
                action_type="inspect_logs",
                target="payment_client",
            ))
        assert obs.done is True
        assert obs.remaining_steps == 0

    def test_invalid_action_type_penalized(self):
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        bogus = ApiDebugAction(
            action_type="nonexistent_action",
            target="payment_client",
        )
        obs = env.step(bogus)
        assert obs.reward < 0
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
# βββ Value Matching Tests βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
class TestValueMatching:
    """Test the _values_match method directly."""

    def setup_method(self):
        # _values_match needs no scenario state, so a bare environment suffices.
        self.env = ApiDebugEnvironment(task_id="easy")

    def test_exact_string_match(self):
        assert self.env._values_match("application/json", "application/json")

    def test_case_insensitive_match(self):
        assert self.env._values_match("Application/JSON", "application/json")

    def test_numeric_exact(self):
        assert self.env._values_match(10, 10)

    def test_numeric_tolerance(self):
        assert self.env._values_match(10, 9)  # Within 25%
        assert not self.env._values_match(10, 5)  # Outside 25%

    def test_boolean_match(self):
        assert self.env._values_match(True, True)
        assert not self.env._values_match(True, False)

    def test_boolean_from_string(self):
        # String "true"/"false" is coerced to the expected boolean.
        assert self.env._values_match(True, "true")
        assert self.env._values_match(False, "false")

    def test_list_containment(self):
        # Submitted list may be a superset of the expected elements.
        assert self.env._values_match([429, 500], [429, 500])
        assert self.env._values_match([429, 500], [500, 429, 502])

    def test_bearer_token_pattern(self):
        # "<token>" placeholder accepts any non-empty token after the prefix.
        assert self.env._values_match("Bearer <token>", "Bearer my_secret_key")
        assert not self.env._values_match("Bearer <token>", "Bearer ")  # Empty token

    def test_wrong_value_rejected(self):
        assert not self.env._values_match("application/json", "text/xml")
        assert not self.env._values_match(10, 100)
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
# βββ Integration Test βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
class TestFullEpisode:
    """Test a complete episode flow."""

    def test_easy_full_solve(self):
        """Run a complete easy episode from start to finish."""
        # Exercises the full inspect -> diagnose -> fix loop in four steps.
        env = ApiDebugEnvironment(task_id="easy")
        obs = env.reset()

        # Step 1: Inspect logs
        obs = env.step(ApiDebugAction(
            action_type="inspect_logs",
            target="payment_client",
        ))
        assert obs.issues_found >= 1

        # Step 2: Inspect config
        obs = env.step(ApiDebugAction(
            action_type="inspect_config",
            target="payment_client",
        ))
        assert "headers" in obs.config_snapshot

        # Step 3: Fix auth
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Authorization": "Bearer my_token_123"},
        ))
        assert obs.issues_fixed >= 1

        # Step 4: Fix content-type
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        # Both issues fixed -> early termination.
        assert obs.issues_fixed == 2
        assert obs.done is True

        # Grade
        # Efficient four-step solve should land well above 0.8.
        score = env.grade()
        assert score > 0.8
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
# Allow running this file directly without invoking the pytest CLI.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
|