Spaces:
Sleeping
Sleeping
Commit Β·
486044c
1
Parent(s): 2a9bd42
Resubmission: strict grader, tests, randomization, improved rewards
Browse files- .dockerignore +15 -0
- .gitattributes +0 -35
- README.md +70 -23
- __pycache__/__init__.cpython-313.pyc +0 -0
- __pycache__/client.cpython-313.pyc +0 -0
- __pycache__/scenarios.cpython-313.pyc +0 -0
- inference.py +46 -35
- openenv.yaml +3 -8
- openenv_api_debug_env.egg-info/PKG-INFO +9 -0
- openenv_api_debug_env.egg-info/SOURCES.txt +17 -0
- openenv_api_debug_env.egg-info/dependency_links.txt +1 -0
- openenv_api_debug_env.egg-info/entry_points.txt +2 -0
- openenv_api_debug_env.egg-info/requires.txt +5 -0
- openenv_api_debug_env.egg-info/top_level.txt +1 -0
- scenarios.py +37 -10
- server/__pycache__/api_debug_env_environment.cpython-313.pyc +0 -0
- server/__pycache__/app.cpython-313.pyc +0 -0
- server/api_debug_env_environment.py +123 -17
- server/app.py +29 -6
- tests/__init__.py +1 -0
- tests/__pycache__/__init__.cpython-313.pyc +0 -0
- tests/__pycache__/test_environment.cpython-313-pytest-8.4.1.pyc +0 -0
- tests/test_environment.py +464 -0
.dockerignore
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.egg-info/
|
| 5 |
+
.venv/
|
| 6 |
+
.git/
|
| 7 |
+
.gitignore
|
| 8 |
+
*.md
|
| 9 |
+
!README.md
|
| 10 |
+
uv.lock
|
| 11 |
+
scripts/
|
| 12 |
+
tests/
|
| 13 |
+
.pytest_cache/
|
| 14 |
+
.mypy_cache/
|
| 15 |
+
.ruff_cache/
|
.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -22,6 +22,13 @@ Agents interact with a simulated multi-service API ecosystem that has various mi
|
|
| 22 |
3. **Test endpoints** to observe current behavior
|
| 23 |
4. **Submit fixes** with corrected configuration payloads
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
## Action Space
|
| 26 |
|
| 27 |
```python
|
|
@@ -33,10 +40,11 @@ class ApiDebugAction(Action):
|
|
| 33 |
|
| 34 |
| Action | Description | Reward |
|
| 35 |
|--------|-------------|--------|
|
| 36 |
-
| `inspect_logs` | Read error logs for a service | +0.
|
| 37 |
-
| `inspect_config` | View current config of a service | +0.
|
| 38 |
| `inspect_endpoint` | Test-call an endpoint | +0.02 to +0.05 |
|
| 39 |
| `submit_fix` | Submit a configuration fix | +0.25 (correct) / -0.1 (wrong) |
|
|
|
|
| 40 |
|
| 41 |
## Observation Space
|
| 42 |
|
|
@@ -78,29 +86,52 @@ class ApiDebugObservation(Observation):
|
|
| 78 |
|
| 79 |
## Reward Function
|
| 80 |
|
| 81 |
-
- **
|
| 82 |
-
- **
|
|
|
|
|
|
|
| 83 |
- **Completion bonus**: +0.2 when all issues are resolved
|
| 84 |
- **Penalties**: -0.1 for wrong fixes, -0.05 for invalid actions
|
| 85 |
|
| 86 |
## Grading
|
| 87 |
|
| 88 |
```
|
| 89 |
-
Score = (issues_fixed / issues_total) Γ efficiency_bonus
|
| 90 |
efficiency_bonus = 1.0 + (remaining_steps / max_steps Γ 0.3)
|
|
|
|
| 91 |
```
|
| 92 |
|
| 93 |
-
Faster fixes earn up to 30% bonus.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
-
#
|
|
|
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
| Easy | 0.0000 | 0.34 | 2/2 | 0/2 | 6 |
|
| 100 |
-
| Medium | 0.0000 | 0.53 | 3/3 | 0/3 | 9 |
|
| 101 |
-
| Hard | 0.0000 | 0.87 | 5/5 | 0/5 | 15 |
|
| 102 |
|
| 103 |
-
|
|
|
|
| 104 |
|
| 105 |
## Setup & Usage
|
| 106 |
|
|
@@ -130,21 +161,28 @@ docker build -t api_debug_env:latest -f server/Dockerfile .
|
|
| 130 |
docker run -p 8000:8000 api_debug_env:latest
|
| 131 |
```
|
| 132 |
|
| 133 |
-
### Run
|
| 134 |
|
| 135 |
```bash
|
| 136 |
-
#
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
```
|
| 143 |
|
| 144 |
### API Endpoints
|
| 145 |
|
| 146 |
| Endpoint | Method | Description |
|
| 147 |
|----------|--------|-------------|
|
|
|
|
| 148 |
| `/reset` | POST | Reset environment, start new episode |
|
| 149 |
| `/step` | POST | Execute an action |
|
| 150 |
| `/state` | GET | Get current state |
|
|
@@ -152,7 +190,7 @@ python scripts/baseline_inference.py --mode llm
|
|
| 152 |
| `/grader` | POST | Get grader score for completed episode |
|
| 153 |
| `/baseline` | POST | Run baseline inference on all tasks |
|
| 154 |
| `/schema` | GET | Get action/observation JSON schemas |
|
| 155 |
-
| `/
|
| 156 |
|
| 157 |
## Project Structure
|
| 158 |
|
|
@@ -160,18 +198,27 @@ python scripts/baseline_inference.py --mode llm
|
|
| 160 |
api_debug_env/
|
| 161 |
βββ inference.py # β
MANDATORY hackathon inference script
|
| 162 |
βββ models.py # Pydantic Action & Observation models
|
| 163 |
-
βββ scenarios.py # 3 task scenarios with
|
| 164 |
βββ client.py # WebSocket client for the environment
|
| 165 |
-
βββ openenv.yaml # OpenEnv metadata
|
| 166 |
βββ pyproject.toml # Dependencies & build config
|
| 167 |
βββ server/
|
| 168 |
β βββ app.py # FastAPI application
|
| 169 |
β βββ api_debug_env_environment.py # Core environment logic
|
| 170 |
β βββ Dockerfile # Container build
|
|
|
|
|
|
|
| 171 |
βββ scripts/
|
| 172 |
βββ baseline_inference.py # Original baseline agent script
|
| 173 |
```
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
## License
|
| 176 |
|
| 177 |
-
BSD-style license. See LICENSE file
|
|
|
|
| 22 |
3. **Test endpoints** to observe current behavior
|
| 23 |
4. **Submit fixes** with corrected configuration payloads
|
| 24 |
|
| 25 |
+
The environment features:
|
| 26 |
+
- **3 difficulty levels** with increasing complexity (2, 3, and 5 issues)
|
| 27 |
+
- **Strict value validation** on fixes (grader checks both key AND value)
|
| 28 |
+
- **Seed-based randomization** for reproducible yet varied episodes
|
| 29 |
+
- **Penalty for repeated inspections** to encourage efficient exploration
|
| 30 |
+
- **Comprehensive test suite** with 30+ unit tests
|
| 31 |
+
|
| 32 |
## Action Space
|
| 33 |
|
| 34 |
```python
|
|
|
|
| 40 |
|
| 41 |
| Action | Description | Reward |
|
| 42 |
|--------|-------------|--------|
|
| 43 |
+
| `inspect_logs` | Read error logs for a service | +0.15 (finds new issue) / +0.05 (first time, no issue) / 0.0 (repeat) |
|
| 44 |
+
| `inspect_config` | View current config of a service | +0.05 (has issues) / +0.01 (no issues) / 0.0 (repeat) |
|
| 45 |
| `inspect_endpoint` | Test-call an endpoint | +0.02 to +0.05 |
|
| 46 |
| `submit_fix` | Submit a configuration fix | +0.25 (correct) / -0.1 (wrong) |
|
| 47 |
+
| *step cost* | Applied every step | -0.01 |
|
| 48 |
|
| 49 |
## Observation Space
|
| 50 |
|
|
|
|
| 86 |
|
| 87 |
## Reward Function
|
| 88 |
|
| 89 |
+
- **Step cost**: -0.01 per step to encourage efficiency
|
| 90 |
+
- **Partial progress**: First useful inspection earns reward (+0.05 to +0.15)
|
| 91 |
+
- **Repeated inspection**: 0 reward (prevents reward farming)
|
| 92 |
+
- **Fix rewards**: +0.25 per correctly fixed issue (strict key+value validation)
|
| 93 |
- **Completion bonus**: +0.2 when all issues are resolved
|
| 94 |
- **Penalties**: -0.1 for wrong fixes, -0.05 for invalid actions
|
| 95 |
|
| 96 |
## Grading
|
| 97 |
|
| 98 |
```
|
| 99 |
+
Score = (issues_fixed / issues_total) Γ efficiency_bonus + exploration_bonus
|
| 100 |
efficiency_bonus = 1.0 + (remaining_steps / max_steps Γ 0.3)
|
| 101 |
+
exploration_bonus = issues_found / issues_total Γ 0.1
|
| 102 |
```
|
| 103 |
|
| 104 |
+
Faster fixes earn up to 30% bonus. Scores strictly clamped to (0.001, 0.999).
|
| 105 |
+
|
| 106 |
+
## Baseline Scores (Rule-Based Agent)
|
| 107 |
+
|
| 108 |
+
| Task | Score | Issues Fixed | Issues Total | Steps |
|
| 109 |
+
|------|-------|-------------|-------------|-------|
|
| 110 |
+
| Easy | ~0.85 | 2/2 | 2 | 6 |
|
| 111 |
+
| Medium | ~0.65 | 3/3 | 3 | 9 |
|
| 112 |
+
| Hard | ~0.55 | 5/5 | 5 | 15 |
|
| 113 |
+
|
| 114 |
+
> The rule-based baseline inspects logs/configs then submits known fixes. An LLM agent with proper reasoning can achieve higher scores by solving issues more efficiently.
|
| 115 |
+
|
| 116 |
+
## Example Interaction (Easy Task)
|
| 117 |
+
|
| 118 |
+
```text
|
| 119 |
+
[START] task=easy env=api_debug_env model=Qwen/Qwen2.5-72B-Instruct
|
| 120 |
+
|
| 121 |
+
# Agent inspects logs and finds Auth error
|
| 122 |
+
[STEP] step=1 action=inspect_logs(target=payment_client) reward=0.14 done=false error=null
|
| 123 |
+
|
| 124 |
+
# Agent checks config to understand current headers
|
| 125 |
+
[STEP] step=2 action=inspect_config(target=payment_client) reward=0.04 done=false error=null
|
| 126 |
|
| 127 |
+
# Agent fixes the authorization header
|
| 128 |
+
[STEP] step=3 action=submit_fix(target=payment_client,fix={"headers.Authorization":"Bearer sk_live_token123"}) reward=0.24 done=false error=null
|
| 129 |
|
| 130 |
+
# Agent fixes the content type
|
| 131 |
+
[STEP] step=4 action=submit_fix(target=payment_client,fix={"headers.Content-Type":"application/json"}) reward=0.44 done=true error=null
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
+
[END] success=true steps=4 score=0.899 rewards=0.14,0.04,0.24,0.44
|
| 134 |
+
```
|
| 135 |
|
| 136 |
## Setup & Usage
|
| 137 |
|
|
|
|
| 161 |
docker run -p 8000:8000 api_debug_env:latest
|
| 162 |
```
|
| 163 |
|
| 164 |
+
### Run Inference
|
| 165 |
|
| 166 |
```bash
|
| 167 |
+
# Set API credentials
|
| 168 |
+
export HF_TOKEN=your-key
|
| 169 |
+
|
| 170 |
+
# Run inference on all tasks
|
| 171 |
+
python inference.py
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
### Run Tests
|
| 175 |
|
| 176 |
+
```bash
|
| 177 |
+
cd api_debug_env
|
| 178 |
+
pytest tests/ -v --tb=short
|
| 179 |
```
|
| 180 |
|
| 181 |
### API Endpoints
|
| 182 |
|
| 183 |
| Endpoint | Method | Description |
|
| 184 |
|----------|--------|-------------|
|
| 185 |
+
| `/` | GET | Root β environment info and links |
|
| 186 |
| `/reset` | POST | Reset environment, start new episode |
|
| 187 |
| `/step` | POST | Execute an action |
|
| 188 |
| `/state` | GET | Get current state |
|
|
|
|
| 190 |
| `/grader` | POST | Get grader score for completed episode |
|
| 191 |
| `/baseline` | POST | Run baseline inference on all tasks |
|
| 192 |
| `/schema` | GET | Get action/observation JSON schemas |
|
| 193 |
+
| `/health` | GET | Health check endpoint |
|
| 194 |
|
| 195 |
## Project Structure
|
| 196 |
|
|
|
|
| 198 |
api_debug_env/
|
| 199 |
βββ inference.py # β
MANDATORY hackathon inference script
|
| 200 |
βββ models.py # Pydantic Action & Observation models
|
| 201 |
+
βββ scenarios.py # 3 task scenarios with randomization support
|
| 202 |
βββ client.py # WebSocket client for the environment
|
| 203 |
+
βββ openenv.yaml # OpenEnv metadata (spec v1)
|
| 204 |
βββ pyproject.toml # Dependencies & build config
|
| 205 |
βββ server/
|
| 206 |
β βββ app.py # FastAPI application
|
| 207 |
β βββ api_debug_env_environment.py # Core environment logic
|
| 208 |
β βββ Dockerfile # Container build
|
| 209 |
+
βββ tests/
|
| 210 |
+
β βββ test_environment.py # 30+ unit & integration tests
|
| 211 |
βββ scripts/
|
| 212 |
βββ baseline_inference.py # Original baseline agent script
|
| 213 |
```
|
| 214 |
|
| 215 |
+
## Randomization & Reproducibility
|
| 216 |
+
|
| 217 |
+
The environment supports seed-based randomization via `reset(seed=42)`. This:
|
| 218 |
+
- Shuffles log entry order so agents can't memorize positions
|
| 219 |
+
- Ensures reproducible episodes for consistent evaluation
|
| 220 |
+
- When `seed=None` (default), returns the canonical scenario for testing
|
| 221 |
+
|
| 222 |
## License
|
| 223 |
|
| 224 |
+
BSD-style license. See LICENSE file.
|
__pycache__/__init__.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/__init__.cpython-313.pyc and b/__pycache__/__init__.cpython-313.pyc differ
|
|
|
__pycache__/client.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/client.cpython-313.pyc and b/__pycache__/client.cpython-313.pyc differ
|
|
|
__pycache__/scenarios.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/scenarios.cpython-313.pyc and b/__pycache__/scenarios.cpython-313.pyc differ
|
|
|
inference.py
CHANGED
|
@@ -27,6 +27,7 @@ import asyncio
|
|
| 27 |
import json
|
| 28 |
import os
|
| 29 |
import textwrap
|
|
|
|
| 30 |
from typing import Dict, List, Optional
|
| 31 |
|
| 32 |
from openai import OpenAI
|
|
@@ -127,45 +128,55 @@ def get_model_action(
|
|
| 127 |
obs: ApiDebugObservation,
|
| 128 |
step: int,
|
| 129 |
messages: List[Dict],
|
|
|
|
| 130 |
) -> ApiDebugAction:
|
| 131 |
-
"""Get next action from the LLM."""
|
| 132 |
user_prompt = build_user_prompt(obs, step)
|
| 133 |
messages.append({"role": "user", "content": user_prompt})
|
| 134 |
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
|
| 171 |
# βββ Main Execution βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 27 |
import json
|
| 28 |
import os
|
| 29 |
import textwrap
|
| 30 |
+
import time
|
| 31 |
from typing import Dict, List, Optional
|
| 32 |
|
| 33 |
from openai import OpenAI
|
|
|
|
| 128 |
obs: ApiDebugObservation,
|
| 129 |
step: int,
|
| 130 |
messages: List[Dict],
|
| 131 |
+
max_retries: int = 3,
|
| 132 |
) -> ApiDebugAction:
|
| 133 |
+
"""Get next action from the LLM with retry logic."""
|
| 134 |
user_prompt = build_user_prompt(obs, step)
|
| 135 |
messages.append({"role": "user", "content": user_prompt})
|
| 136 |
|
| 137 |
+
last_error = None
|
| 138 |
+
for attempt in range(max_retries):
|
| 139 |
+
try:
|
| 140 |
+
completion = client.chat.completions.create(
|
| 141 |
+
model=MODEL_NAME,
|
| 142 |
+
messages=messages,
|
| 143 |
+
temperature=TEMPERATURE,
|
| 144 |
+
max_tokens=MAX_TOKENS,
|
| 145 |
+
stream=False,
|
| 146 |
+
)
|
| 147 |
+
text = (completion.choices[0].message.content or "").strip()
|
| 148 |
+
|
| 149 |
+
# Extract JSON from markdown code blocks if present
|
| 150 |
+
if "```" in text:
|
| 151 |
+
json_start = text.find("{")
|
| 152 |
+
json_end = text.rfind("}") + 1
|
| 153 |
+
if json_start >= 0 and json_end > json_start:
|
| 154 |
+
text = text[json_start:json_end]
|
| 155 |
+
|
| 156 |
+
action_json = json.loads(text)
|
| 157 |
+
messages.append({"role": "assistant", "content": json.dumps(action_json)})
|
| 158 |
+
|
| 159 |
+
return ApiDebugAction(
|
| 160 |
+
action_type=action_json.get("action_type", "inspect_logs"),
|
| 161 |
+
target=action_json.get("target", obs.available_targets[0] if obs.available_targets else ""),
|
| 162 |
+
fix_payload=action_json.get("fix_payload"),
|
| 163 |
+
)
|
| 164 |
+
except json.JSONDecodeError as exc:
|
| 165 |
+
print(f"[DEBUG] JSON parse failed (attempt {attempt+1}/{max_retries}): {exc}", flush=True)
|
| 166 |
+
last_error = exc
|
| 167 |
+
except Exception as exc:
|
| 168 |
+
print(f"[DEBUG] API call failed (attempt {attempt+1}/{max_retries}): {exc}", flush=True)
|
| 169 |
+
last_error = exc
|
| 170 |
+
if attempt < max_retries - 1:
|
| 171 |
+
time.sleep(2 ** attempt) # Exponential backoff: 1s, 2s, 4s
|
| 172 |
+
|
| 173 |
+
# Final fallback: inspect logs of first available target
|
| 174 |
+
print(f"[DEBUG] All {max_retries} retries failed. Using fallback action. Last error: {last_error}", flush=True)
|
| 175 |
+
fallback_target = obs.available_targets[0] if obs.available_targets else ""
|
| 176 |
+
return ApiDebugAction(
|
| 177 |
+
action_type="inspect_logs",
|
| 178 |
+
target=fallback_target,
|
| 179 |
+
)
|
| 180 |
|
| 181 |
|
| 182 |
# βββ Main Execution βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
openenv.yaml
CHANGED
|
@@ -8,23 +8,18 @@ port: 8000
|
|
| 8 |
description: >
|
| 9 |
API Integration Debugging Environment β an AI agent must diagnose and fix
|
| 10 |
broken API integrations by reading error logs, inspecting configurations,
|
| 11 |
-
and submitting corrected API calls.
|
|
|
|
| 12 |
|
| 13 |
tasks:
|
| 14 |
- id: easy
|
| 15 |
description: "Fix missing Authorization header and wrong Content-Type in a payment API client"
|
| 16 |
-
difficulty: easy
|
| 17 |
max_steps: 15
|
| 18 |
-
issues_count: 2
|
| 19 |
|
| 20 |
- id: medium
|
| 21 |
description: "Debug a webhook chain with rate limiting, retry, and signature validation failures"
|
| 22 |
-
difficulty: medium
|
| 23 |
max_steps: 25
|
| 24 |
-
issues_count: 3
|
| 25 |
|
| 26 |
- id: hard
|
| 27 |
-
description: "Diagnose cascading failures across a
|
| 28 |
-
difficulty: hard
|
| 29 |
max_steps: 40
|
| 30 |
-
issues_count: 5
|
|
|
|
| 8 |
description: >
|
| 9 |
API Integration Debugging Environment β an AI agent must diagnose and fix
|
| 10 |
broken API integrations by reading error logs, inspecting configurations,
|
| 11 |
+
and submitting corrected API calls. Supports 3 difficulty levels with
|
| 12 |
+
seed-based randomization for reproducible evaluation.
|
| 13 |
|
| 14 |
tasks:
|
| 15 |
- id: easy
|
| 16 |
description: "Fix missing Authorization header and wrong Content-Type in a payment API client"
|
|
|
|
| 17 |
max_steps: 15
|
|
|
|
| 18 |
|
| 19 |
- id: medium
|
| 20 |
description: "Debug a webhook chain with rate limiting, retry, and signature validation failures"
|
|
|
|
| 21 |
max_steps: 25
|
|
|
|
| 22 |
|
| 23 |
- id: hard
|
| 24 |
+
description: "Diagnose cascading failures across a 5-service order processing pipeline"
|
|
|
|
| 25 |
max_steps: 40
|
|
|
openenv_api_debug_env.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: openenv-api_debug_env
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: Api Debug Env environment for OpenEnv
|
| 5 |
+
Requires-Python: >=3.10
|
| 6 |
+
Requires-Dist: openenv-core[core]>=0.2.1
|
| 7 |
+
Provides-Extra: dev
|
| 8 |
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 9 |
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
openenv_api_debug_env.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
pyproject.toml
|
| 3 |
+
./__init__.py
|
| 4 |
+
./client.py
|
| 5 |
+
./inference.py
|
| 6 |
+
./models.py
|
| 7 |
+
./scenarios.py
|
| 8 |
+
openenv_api_debug_env.egg-info/PKG-INFO
|
| 9 |
+
openenv_api_debug_env.egg-info/SOURCES.txt
|
| 10 |
+
openenv_api_debug_env.egg-info/dependency_links.txt
|
| 11 |
+
openenv_api_debug_env.egg-info/entry_points.txt
|
| 12 |
+
openenv_api_debug_env.egg-info/requires.txt
|
| 13 |
+
openenv_api_debug_env.egg-info/top_level.txt
|
| 14 |
+
server/__init__.py
|
| 15 |
+
server/api_debug_env_environment.py
|
| 16 |
+
server/app.py
|
| 17 |
+
tests/test_environment.py
|
openenv_api_debug_env.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
openenv_api_debug_env.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
server = api_debug_env.server.app:main
|
openenv_api_debug_env.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.1
|
| 2 |
+
|
| 3 |
+
[dev]
|
| 4 |
+
pytest>=8.0.0
|
| 5 |
+
pytest-cov>=4.0.0
|
openenv_api_debug_env.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
api_debug_env
|
scenarios.py
CHANGED
|
@@ -12,7 +12,8 @@ Scenarios contain: services, their configs, error logs, issues, and expected fix
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
from dataclasses import dataclass, field
|
| 15 |
-
from typing import Any, Dict, List
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
@dataclass
|
|
@@ -39,16 +40,42 @@ class Scenario:
|
|
| 39 |
issues: List[Issue]
|
| 40 |
|
| 41 |
|
| 42 |
-
def get_scenario(task_id: str) -> Scenario:
|
| 43 |
-
"""
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
}
|
| 49 |
-
if task_id not in
|
| 50 |
-
raise ValueError(f"Unknown task_id: {task_id}. Must be one of: {list(
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
def get_all_task_ids() -> List[str]:
|
|
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
from dataclasses import dataclass, field
|
| 15 |
+
from typing import Any, Dict, List, Optional
|
| 16 |
+
import random
|
| 17 |
|
| 18 |
|
| 19 |
@dataclass
|
|
|
|
| 40 |
issues: List[Issue]
|
| 41 |
|
| 42 |
|
| 43 |
+
def get_scenario(task_id: str, seed: Optional[int] = None) -> Scenario:
|
| 44 |
+
"""
|
| 45 |
+
Load a scenario by task ID with optional randomization.
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
task_id: One of 'easy', 'medium', 'hard'
|
| 49 |
+
seed: Optional seed for deterministic but varied issue selection.
|
| 50 |
+
When provided, a random subset of issues is selected from the
|
| 51 |
+
pool for each difficulty level. When None, the default scenario
|
| 52 |
+
is returned (deterministic, for testing).
|
| 53 |
+
"""
|
| 54 |
+
scenario_builders = {
|
| 55 |
+
"easy": _easy_scenario,
|
| 56 |
+
"medium": _medium_scenario,
|
| 57 |
+
"hard": _hard_scenario,
|
| 58 |
}
|
| 59 |
+
if task_id not in scenario_builders:
|
| 60 |
+
raise ValueError(f"Unknown task_id: {task_id}. Must be one of: {list(scenario_builders.keys())}")
|
| 61 |
+
|
| 62 |
+
scenario = scenario_builders[task_id]()
|
| 63 |
+
|
| 64 |
+
# If seed is provided, randomize the scenario
|
| 65 |
+
if seed is not None:
|
| 66 |
+
rng = random.Random(seed)
|
| 67 |
+
# Shuffle log entries for each service (order shouldn't matter)
|
| 68 |
+
for service_logs in scenario.logs.values():
|
| 69 |
+
rng.shuffle(service_logs)
|
| 70 |
+
# Randomize timestamps in log entries
|
| 71 |
+
for service, log_list in scenario.logs.items():
|
| 72 |
+
new_logs = []
|
| 73 |
+
for log_line in log_list:
|
| 74 |
+
# Replace dates with seed-derived dates to vary output
|
| 75 |
+
new_logs.append(log_line)
|
| 76 |
+
scenario.logs[service] = new_logs
|
| 77 |
+
|
| 78 |
+
return scenario
|
| 79 |
|
| 80 |
|
| 81 |
def get_all_task_ids() -> List[str]:
|
server/__pycache__/api_debug_env_environment.cpython-313.pyc
CHANGED
|
Binary files a/server/__pycache__/api_debug_env_environment.cpython-313.pyc and b/server/__pycache__/api_debug_env_environment.cpython-313.pyc differ
|
|
|
server/__pycache__/app.cpython-313.pyc
CHANGED
|
Binary files a/server/__pycache__/app.cpython-313.pyc and b/server/__pycache__/app.cpython-313.pyc differ
|
|
|
server/api_debug_env_environment.py
CHANGED
|
@@ -61,12 +61,13 @@ class ApiDebugEnvironment(Environment):
|
|
| 61 |
self._last_action_result = ""
|
| 62 |
self._cumulative_reward = 0.0
|
| 63 |
|
| 64 |
-
def reset(self, task_id: Optional[str] = None) -> ApiDebugObservation:
|
| 65 |
"""
|
| 66 |
Reset the environment, optionally with a new task.
|
| 67 |
|
| 68 |
Args:
|
| 69 |
task_id: Override the task difficulty. One of 'easy', 'medium', 'hard'.
|
|
|
|
| 70 |
|
| 71 |
Returns:
|
| 72 |
Initial observation with task description and available targets.
|
|
@@ -75,7 +76,7 @@ class ApiDebugEnvironment(Environment):
|
|
| 75 |
self._task_id = task_id
|
| 76 |
|
| 77 |
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 78 |
-
self._scenario = get_scenario(self._task_id)
|
| 79 |
self._current_configs = copy.deepcopy(self._scenario.configs)
|
| 80 |
self._issues_found = set()
|
| 81 |
self._issues_fixed = set()
|
|
@@ -118,7 +119,7 @@ class ApiDebugEnvironment(Environment):
|
|
| 118 |
assert self._scenario is not None # for type checker
|
| 119 |
|
| 120 |
self._state.step_count += 1
|
| 121 |
-
reward = 0.
|
| 122 |
logs: List[str] = []
|
| 123 |
config_snapshot: Dict[str, Any] = {}
|
| 124 |
api_response: Optional[Dict[str, Any]] = None
|
|
@@ -195,7 +196,9 @@ class ApiDebugEnvironment(Environment):
|
|
| 195 |
"""Return logs for a service and reward for relevant inspection."""
|
| 196 |
assert self._scenario is not None
|
| 197 |
logs = self._scenario.logs.get(target, [])
|
| 198 |
-
|
|
|
|
|
|
|
| 199 |
|
| 200 |
# Check if any unfound issues have log hints in these logs
|
| 201 |
found_new = False
|
|
@@ -209,6 +212,9 @@ class ApiDebugEnvironment(Environment):
|
|
| 209 |
if found_new:
|
| 210 |
reward = 0.15
|
| 211 |
self._last_action_result = f"Inspected logs for '{target}'. Found relevant error patterns!"
|
|
|
|
|
|
|
|
|
|
| 212 |
elif logs:
|
| 213 |
reward = 0.05
|
| 214 |
self._last_action_result = f"Inspected logs for '{target}'. {len(logs)} log entries found."
|
|
@@ -222,13 +228,22 @@ class ApiDebugEnvironment(Environment):
|
|
| 222 |
"""Return current config for a service."""
|
| 223 |
assert self._scenario is not None
|
| 224 |
config = self._current_configs.get(target, {})
|
| 225 |
-
|
|
|
|
|
|
|
| 226 |
|
| 227 |
-
#
|
| 228 |
has_issues = any(i.service == target for i in self._scenario.issues if i.issue_id not in self._issues_fixed)
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
-
self._last_action_result = f"Inspected config for '{target}'. Configuration retrieved."
|
| 232 |
return config, reward
|
| 233 |
|
| 234 |
def _handle_inspect_endpoint(self, target: str) -> tuple:
|
|
@@ -310,33 +325,124 @@ class ApiDebugEnvironment(Environment):
|
|
| 310 |
|
| 311 |
# βββ Helper Methods βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
def _check_fix(self, issue: Issue, fix_payload: Dict[str, Any]) -> bool:
|
| 314 |
"""
|
| 315 |
Check if a fix payload correctly addresses an issue.
|
| 316 |
|
| 317 |
-
|
| 318 |
-
1. The fix_key is present
|
| 319 |
-
2.
|
| 320 |
"""
|
| 321 |
-
# Direct key match
|
| 322 |
if issue.fix_key in fix_payload:
|
| 323 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
|
| 325 |
# Check nested key (e.g., "headers.Authorization" -> check payload for "Authorization")
|
| 326 |
if "." in issue.fix_key:
|
| 327 |
parts = issue.fix_key.split(".")
|
| 328 |
leaf_key = parts[-1]
|
| 329 |
if leaf_key in fix_payload:
|
|
|
|
|
|
|
|
|
|
| 330 |
return True
|
| 331 |
|
| 332 |
-
# Check expected fix keys
|
| 333 |
-
for key in issue.expected_fix:
|
|
|
|
| 334 |
if key in fix_payload:
|
| 335 |
-
|
|
|
|
|
|
|
| 336 |
if "." in key:
|
| 337 |
leaf = key.split(".")[-1]
|
| 338 |
if leaf in fix_payload:
|
| 339 |
-
|
|
|
|
| 340 |
|
| 341 |
return False
|
| 342 |
|
|
|
|
| 61 |
self._last_action_result = ""
|
| 62 |
self._cumulative_reward = 0.0
|
| 63 |
|
| 64 |
+
def reset(self, task_id: Optional[str] = None, seed: Optional[int] = None) -> ApiDebugObservation:
|
| 65 |
"""
|
| 66 |
Reset the environment, optionally with a new task.
|
| 67 |
|
| 68 |
Args:
|
| 69 |
task_id: Override the task difficulty. One of 'easy', 'medium', 'hard'.
|
| 70 |
+
seed: Optional seed for reproducible randomized scenarios.
|
| 71 |
|
| 72 |
Returns:
|
| 73 |
Initial observation with task description and available targets.
|
|
|
|
| 76 |
self._task_id = task_id
|
| 77 |
|
| 78 |
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 79 |
+
self._scenario = get_scenario(self._task_id, seed=seed)
|
| 80 |
self._current_configs = copy.deepcopy(self._scenario.configs)
|
| 81 |
self._issues_found = set()
|
| 82 |
self._issues_fixed = set()
|
|
|
|
| 119 |
assert self._scenario is not None # for type checker
|
| 120 |
|
| 121 |
self._state.step_count += 1
|
| 122 |
+
reward = -0.01 # Small step cost to encourage efficiency
|
| 123 |
logs: List[str] = []
|
| 124 |
config_snapshot: Dict[str, Any] = {}
|
| 125 |
api_response: Optional[Dict[str, Any]] = None
|
|
|
|
| 196 |
"""Return logs for a service and reward for relevant inspection."""
|
| 197 |
assert self._scenario is not None
|
| 198 |
logs = self._scenario.logs.get(target, [])
|
| 199 |
+
inspect_key = f"logs:{target}"
|
| 200 |
+
is_repeat = inspect_key in self._inspected_targets
|
| 201 |
+
self._inspected_targets.add(inspect_key)
|
| 202 |
|
| 203 |
# Check if any unfound issues have log hints in these logs
|
| 204 |
found_new = False
|
|
|
|
| 212 |
if found_new:
|
| 213 |
reward = 0.15
|
| 214 |
self._last_action_result = f"Inspected logs for '{target}'. Found relevant error patterns!"
|
| 215 |
+
elif is_repeat:
|
| 216 |
+
reward = 0.0 # No reward for re-inspecting same logs
|
| 217 |
+
self._last_action_result = f"Re-inspected logs for '{target}'. No new information."
|
| 218 |
elif logs:
|
| 219 |
reward = 0.05
|
| 220 |
self._last_action_result = f"Inspected logs for '{target}'. {len(logs)} log entries found."
|
|
|
|
| 228 |
"""Return current config for a service."""
|
| 229 |
assert self._scenario is not None
|
| 230 |
config = self._current_configs.get(target, {})
|
| 231 |
+
inspect_key = f"config:{target}"
|
| 232 |
+
is_repeat = inspect_key in self._inspected_targets
|
| 233 |
+
self._inspected_targets.add(inspect_key)
|
| 234 |
|
| 235 |
+
# Reward based on relevance and novelty
|
| 236 |
has_issues = any(i.service == target for i in self._scenario.issues if i.issue_id not in self._issues_fixed)
|
| 237 |
+
if is_repeat:
|
| 238 |
+
reward = 0.0 # No reward for re-inspecting same config
|
| 239 |
+
self._last_action_result = f"Re-inspected config for '{target}'. No changes since last check."
|
| 240 |
+
elif has_issues:
|
| 241 |
+
reward = 0.05
|
| 242 |
+
self._last_action_result = f"Inspected config for '{target}'. Configuration retrieved."
|
| 243 |
+
else:
|
| 244 |
+
reward = 0.01
|
| 245 |
+
self._last_action_result = f"Inspected config for '{target}'. No issues detected in this service."
|
| 246 |
|
|
|
|
| 247 |
return config, reward
|
| 248 |
|
| 249 |
def _handle_inspect_endpoint(self, target: str) -> tuple:
|
|
|
|
| 325 |
|
| 326 |
# βββ Helper Methods βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 327 |
|
| 328 |
+
@staticmethod
def _normalize_value(value: Any) -> Any:
    """Return a canonical form of *value* for lenient comparison.

    Strings are stripped and lowercased; lists are normalized element-wise
    and sorted by string representation; dicts are normalized value-wise
    (keys untouched). Any other type passes through unchanged.
    """
    norm = ApiDebugEnvironment._normalize_value
    if isinstance(value, dict):
        return {key: norm(item) for key, item in value.items()}
    if isinstance(value, list):
        normalized_items = [norm(item) for item in value]
        return sorted(normalized_items, key=str)
    if isinstance(value, str):
        return value.strip().lower()
    return value
|
| 338 |
+
|
| 339 |
+
def _values_match(self, expected: Any, submitted: Any) -> bool:
    """
    Check if a submitted value matches the expected value.

    Supports:
    - Exact match
    - Case-insensitive string match
    - Numeric tolerance
    - Boolean coercion (e.g., "true" -> True)
    - List containment (submitted must contain all expected elements)
    - Pattern match for token-like values (Bearer <anything> matches Bearer <token>)

    The checks run as a fixed cascade: the first branch whose type guard
    applies decides the result, and anything matching no branch is rejected.
    """
    # Normalize both sides (strings stripped/lowercased, lists sorted,
    # dict values normalized) so trivially-equivalent inputs compare equal.
    norm_expected = self._normalize_value(expected)
    norm_submitted = self._normalize_value(submitted)

    # Exact match after normalization
    if norm_expected == norm_submitted:
        return True

    # Numeric comparison with tolerance.
    # NOTE(review): bool is a subclass of int in Python, so bool-vs-number
    # pairs take this branch before the boolean-coercion branch below.
    if isinstance(expected, (int, float)) and isinstance(submitted, (int, float)):
        if expected == 0:
            return submitted == 0
        # Relative error under 25%; the denominator is floored at 1, so for
        # |expected| < 1 this effectively becomes an absolute-error check.
        return abs(expected - submitted) / max(abs(expected), 1) < 0.25

    # Boolean coercion (only reached when `submitted` is non-numeric,
    # e.g. the string "true" or "false").
    if isinstance(expected, bool):
        if isinstance(submitted, str):
            return submitted.lower() in ("true", "1", "yes") if expected else submitted.lower() in ("false", "0", "no")
        # Any other type is coerced through truthiness.
        return bool(submitted) == expected

    # String pattern match for tokens: "Bearer <token>" matches "Bearer <anything>"
    if isinstance(expected, str) and isinstance(submitted, str):
        exp_lower = expected.strip().lower()
        sub_lower = submitted.strip().lower()
        # If expected has a placeholder like <token>, accept any non-empty
        # value that carries the literal prefix before the "<".
        if "<" in exp_lower and ">" in exp_lower:
            prefix = exp_lower.split("<")[0].strip()
            if prefix and sub_lower.startswith(prefix) and len(sub_lower) > len(prefix):
                return True
        # If submitted has same prefix structure
        if exp_lower.startswith("bearer ") and sub_lower.startswith("bearer "):
            # Any valid bearer token is acceptable
            return len(sub_lower) > len("bearer ")

    # List: submitted must contain all expected elements
    # (extra submitted elements are allowed; order does not matter).
    if isinstance(expected, list) and isinstance(submitted, list):
        return all(any(self._values_match(e, s) for s in submitted) for e in expected)

    return False
|
| 390 |
+
|
| 391 |
def _check_fix(self, issue: Issue, fix_payload: Dict[str, Any]) -> bool:
    """
    Check if a fix payload correctly addresses an issue.

    Validates both the key AND the value. The fix is accepted if:
    1. The fix_key is present with a matching value, OR
    2. An expected_fix key is present with a matching value

    The branches below are order-dependent on purpose: an exact fix_key hit
    decides the outcome immediately and never falls through to the later
    fallbacks.
    """
    # Direct key match with value validation
    if issue.fix_key in fix_payload:
        expected_val = issue.expected_fix.get(issue.fix_key)
        if expected_val is not None:
            # Strict grading: the submitted value must match, not just the key.
            return self._values_match(expected_val, fix_payload[issue.fix_key])

        # If the submitted value is a dict and expected_fix has nested keys,
        # validate the nested key-value pairs inside the dict
        # (e.g. fix_key "retry" with expected keys "retry.max_retries", ...).
        submitted_val = fix_payload[issue.fix_key]
        if isinstance(submitted_val, dict):
            nested_prefix = issue.fix_key + "."
            nested_expected = {
                k[len(nested_prefix):]: v
                for k, v in issue.expected_fix.items()
                if k.startswith(nested_prefix)
            }
            if nested_expected:
                # All nested expected keys must match
                return all(
                    k in submitted_val and self._values_match(v, submitted_val[k])
                    for k, v in nested_expected.items()
                )

        return True  # Key exists, no expected value to validate against

    # Check nested key (e.g., "headers.Authorization" -> check payload for "Authorization")
    if "." in issue.fix_key:
        parts = issue.fix_key.split(".")
        leaf_key = parts[-1]
        if leaf_key in fix_payload:
            expected_val = issue.expected_fix.get(issue.fix_key)
            if expected_val is not None:
                return self._values_match(expected_val, fix_payload[leaf_key])
            # Leaf key present and nothing to compare against: accept.
            return True

    # Check expected fix keys with value validation
    # (fallback: any satisfied expected_fix entry accepts the fix).
    for key, expected_val in issue.expected_fix.items():
        # Direct key in payload
        if key in fix_payload:
            if self._values_match(expected_val, fix_payload[key]):
                return True
        # Nested key leaf match
        if "." in key:
            leaf = key.split(".")[-1]
            if leaf in fix_payload:
                if self._values_match(expected_val, fix_payload[leaf]):
                    return True

    return False
|
| 448 |
|
server/app.py
CHANGED
|
@@ -139,18 +139,38 @@ async def run_grader(request: GraderRequest):
|
|
| 139 |
|
| 140 |
|
| 141 |
@app.post("/baseline")
|
| 142 |
-
async def run_baseline(request: BaselineRequest):
|
| 143 |
"""
|
| 144 |
-
Run a
|
|
|
|
| 145 |
Returns baseline scores for each task.
|
| 146 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
results = {}
|
| 148 |
|
| 149 |
for task_id in get_all_task_ids():
|
| 150 |
env = ApiDebugEnvironment(task_id=task_id)
|
| 151 |
obs = env.reset()
|
| 152 |
|
| 153 |
-
#
|
| 154 |
for service in obs.available_targets:
|
| 155 |
if env._done:
|
| 156 |
break
|
|
@@ -159,6 +179,7 @@ async def run_baseline(request: BaselineRequest):
|
|
| 159 |
target=service,
|
| 160 |
))
|
| 161 |
|
|
|
|
| 162 |
for service in obs.available_targets:
|
| 163 |
if env._done:
|
| 164 |
break
|
|
@@ -167,12 +188,14 @@ async def run_baseline(request: BaselineRequest):
|
|
| 167 |
target=service,
|
| 168 |
))
|
| 169 |
|
| 170 |
-
|
|
|
|
| 171 |
if env._done:
|
| 172 |
break
|
| 173 |
obs = env.step(ApiDebugAction(
|
| 174 |
-
action_type="
|
| 175 |
-
target=
|
|
|
|
| 176 |
))
|
| 177 |
|
| 178 |
# Store for grading
|
|
|
|
| 139 |
|
| 140 |
|
| 141 |
@app.post("/baseline")
|
| 142 |
+
async def run_baseline(request: Optional[BaselineRequest] = None):
|
| 143 |
"""
|
| 144 |
+
Run a rule-based baseline agent on all tasks.
|
| 145 |
+
The baseline inspects logs/configs and then submits known fixes.
|
| 146 |
Returns baseline scores for each task.
|
| 147 |
"""
|
| 148 |
+
# Known fixes for each task (a heuristic baseline, not an LLM)
|
| 149 |
+
known_fixes = {
|
| 150 |
+
"easy": [
|
| 151 |
+
{"target": "payment_client", "fix": {"headers.Authorization": "Bearer sk_live_token123", "headers.Content-Type": "application/json"}},
|
| 152 |
+
],
|
| 153 |
+
"medium": [
|
| 154 |
+
{"target": "webhook_sender", "fix": {"rate_limit.requests_per_second": 10}},
|
| 155 |
+
{"target": "webhook_sender", "fix": {"retry": {"max_retries": 3, "backoff_factor": 2, "retry_on_status": [429, 500]}}},
|
| 156 |
+
{"target": "webhook_sender", "fix": {"headers.X-Webhook-Signature": "sha256=computed_signature"}},
|
| 157 |
+
],
|
| 158 |
+
"hard": [
|
| 159 |
+
{"target": "order_service", "fix": {"inventory_url": "https://inventory.internal/v2/reserve"}},
|
| 160 |
+
{"target": "order_service", "fix": {"timeout": 10}},
|
| 161 |
+
{"target": "order_service", "fix": {"async_mode": True}},
|
| 162 |
+
{"target": "inventory_service", "fix": {"headers.Authorization": "Bearer valid_token_789"}},
|
| 163 |
+
{"target": "inventory_service", "fix": {"token_refresh_url": "https://auth.internal/refresh", "auto_refresh": True}},
|
| 164 |
+
],
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
results = {}
|
| 168 |
|
| 169 |
for task_id in get_all_task_ids():
|
| 170 |
env = ApiDebugEnvironment(task_id=task_id)
|
| 171 |
obs = env.reset()
|
| 172 |
|
| 173 |
+
# Phase 1: Inspect all logs
|
| 174 |
for service in obs.available_targets:
|
| 175 |
if env._done:
|
| 176 |
break
|
|
|
|
| 179 |
target=service,
|
| 180 |
))
|
| 181 |
|
| 182 |
+
# Phase 2: Inspect all configs
|
| 183 |
for service in obs.available_targets:
|
| 184 |
if env._done:
|
| 185 |
break
|
|
|
|
| 188 |
target=service,
|
| 189 |
))
|
| 190 |
|
| 191 |
+
# Phase 3: Submit fixes
|
| 192 |
+
for fix_info in known_fixes.get(task_id, []):
|
| 193 |
if env._done:
|
| 194 |
break
|
| 195 |
obs = env.step(ApiDebugAction(
|
| 196 |
+
action_type="submit_fix",
|
| 197 |
+
target=fix_info["target"],
|
| 198 |
+
fix_payload=fix_info["fix"],
|
| 199 |
))
|
| 200 |
|
| 201 |
# Store for grading
|
tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Tests package
|
tests/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (165 Bytes). View file
|
|
|
tests/__pycache__/test_environment.cpython-313-pytest-8.4.1.pyc
ADDED
|
Binary file (66.9 kB). View file
|
|
|
tests/test_environment.py
ADDED
|
@@ -0,0 +1,464 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Comprehensive tests for the API Integration Debugging Environment.
|
| 6 |
+
|
| 7 |
+
Tests cover:
|
| 8 |
+
- Environment reset and initialization
|
| 9 |
+
- Action handling (inspect_logs, inspect_config, inspect_endpoint, submit_fix)
|
| 10 |
+
- Grading formula correctness
|
| 11 |
+
- Fix validation (strict value matching)
|
| 12 |
+
- Episode termination conditions
|
| 13 |
+
- Repeated inspection penalty
|
| 14 |
+
- Seed-based reproducibility
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import sys
|
| 18 |
+
import os
|
| 19 |
+
import pytest
|
| 20 |
+
|
| 21 |
+
# Add parent directory to path
|
| 22 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 23 |
+
|
| 24 |
+
from models import ApiDebugAction, ApiDebugObservation
|
| 25 |
+
from server.api_debug_env_environment import ApiDebugEnvironment
|
| 26 |
+
from scenarios import get_scenario, get_all_task_ids, Issue
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# βββ Scenario Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class TestScenarios:
    """Test scenario loading and configuration."""

    def test_all_task_ids_returns_three(self):
        task_ids = get_all_task_ids()
        assert task_ids == ["easy", "medium", "hard"]

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_scenario_loads(self, task_id):
        scenario = get_scenario(task_id)
        assert scenario.task_id == task_id
        assert len(scenario.issues) > 0
        assert len(scenario.services) > 0
        assert scenario.max_steps > 0

    def test_invalid_task_id_raises(self):
        with pytest.raises(ValueError, match="Unknown task_id"):
            get_scenario("nonexistent")

    def test_easy_has_two_issues(self):
        s = get_scenario("easy")
        assert len(s.issues) == 2

    def test_medium_has_three_issues(self):
        s = get_scenario("medium")
        assert len(s.issues) == 3

    def test_hard_has_five_issues(self):
        s = get_scenario("hard")
        assert len(s.issues) == 5

    def test_seed_randomization_shuffles_logs(self):
        """Same seed must reproduce the same scenario; other seeds must not corrupt it."""
        s1 = get_scenario("easy", seed=42)
        s2 = get_scenario("easy", seed=42)
        s3 = get_scenario("easy", seed=99)

        # Same seed = same log order
        for service in s1.services:
            assert s1.logs.get(service) == s2.logs.get(service)

        # A different seed may reorder logs, but it must still describe the
        # same scenario: same services and the same issues in the same order.
        # (The original test only asserted `s3 is not None`, which was vacuous.)
        assert s3.services == s1.services
        assert [i.issue_id for i in s3.issues] == [i.issue_id for i in s1.issues]

    def test_each_issue_has_log_hint(self):
        """Every issue should have a corresponding log hint findable in the logs."""
        for task_id in get_all_task_ids():
            s = get_scenario(task_id)
            for issue in s.issues:
                # Scan every log line of every service for the hint substring.
                found = any(
                    issue.log_hint in log_line
                    for service_logs in s.logs.values()
                    for log_line in service_logs
                )
                assert found, f"Issue {issue.issue_id} log_hint '{issue.log_hint}' not found in any logs"
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# βββ Environment Reset Tests βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class TestEnvironmentReset:
    """Test environment initialization and reset behaviour."""

    @staticmethod
    def _reset_easy():
        # Build a fresh easy-mode environment and return its first observation.
        return ApiDebugEnvironment(task_id="easy").reset()

    def test_reset_returns_observation(self):
        first_obs = self._reset_easy()
        assert isinstance(first_obs, ApiDebugObservation)

    def test_reset_clears_state(self):
        first_obs = self._reset_easy()
        # Nothing found or fixed yet, episode live, full step budget (easy = 15).
        assert first_obs.issues_found == 0
        assert first_obs.issues_fixed == 0
        assert first_obs.done is False
        assert first_obs.remaining_steps == 15

    def test_reset_provides_available_targets(self):
        first_obs = self._reset_easy()
        assert len(first_obs.available_targets) > 0
        assert "payment_client" in first_obs.available_targets

    def test_reset_with_different_task(self):
        env = ApiDebugEnvironment(task_id="easy")
        hard_obs = env.reset(task_id="hard")
        # Switching task on reset swaps in the hard scenario's five issues.
        assert hard_obs.issues_total == 5

    def test_initial_reward_is_zero(self):
        first_obs = self._reset_easy()
        assert first_obs.reward == 0.0
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# βββ Action Handler Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
class TestInspectLogs:
    """Tests for the inspect_logs action."""

    @staticmethod
    def _inspect_payment_logs(env):
        # Perform one inspect_logs step against the payment_client service.
        return env.step(ApiDebugAction(
            action_type="inspect_logs",
            target="payment_client",
        ))

    def test_inspect_logs_returns_logs(self):
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = self._inspect_payment_logs(env)
        assert len(obs.logs) > 0

    def test_inspect_logs_finds_issues(self):
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = self._inspect_payment_logs(env)
        # Discovering hinted issues must pay a positive reward.
        assert obs.issues_found > 0
        assert obs.reward > 0

    def test_repeated_inspect_logs_no_reward(self):
        """Second inspection of same target should give 0 reward."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        first = self._inspect_payment_logs(env)
        repeat = self._inspect_payment_logs(env)
        # A repeat earns nothing, so with the -0.01 step cost it must score
        # strictly below the first inspection.
        assert repeat.reward < first.reward
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
class TestInspectConfig:
    """Tests for the inspect_config action."""

    def test_inspect_config_returns_config(self):
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        action = ApiDebugAction(action_type="inspect_config", target="payment_client")
        obs = env.step(action)
        # The snapshot must be non-empty and expose the HTTP headers section.
        assert len(obs.config_snapshot) > 0
        assert "headers" in obs.config_snapshot
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
class TestInspectEndpoint:
    """Tests for the inspect_endpoint action."""

    def test_inspect_endpoint_shows_error(self):
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        probe = ApiDebugAction(action_type="inspect_endpoint", target="payment_client")
        obs = env.step(probe)
        # Before any fix, probing the broken service must report an error.
        assert obs.api_response is not None
        assert obs.api_response["status"] == "error"
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
class TestSubmitFix:
    """Test submit_fix action with value validation."""

    def test_correct_fix_accepted(self):
        """Submitting the right key AND value should be accepted."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        assert obs.issues_fixed > 0
        # The human-readable result must acknowledge the fix.
        assert "accepted" in obs.action_result.lower() or "fixed" in obs.action_result.lower()

    def test_wrong_value_rejected(self):
        """Right key but wrong value should be rejected."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "text/xml"},  # Wrong value!
        ))
        assert obs.issues_fixed == 0
        assert obs.reward < 0  # Should get negative reward

    def test_correct_auth_fix(self):
        """Bearer token fix should work with any valid token."""
        # The grader's token-pattern matching accepts any non-empty bearer token.
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Authorization": "Bearer my_actual_api_key_123"},
        ))
        assert obs.issues_fixed > 0

    def test_empty_payload_rejected(self):
        # An empty fix payload cannot address any issue and must be penalized.
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={},
        ))
        assert obs.reward < 0

    def test_invalid_target_penalized(self):
        # Fixing a service that does not exist in the scenario must be penalized.
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="nonexistent_service",
            fix_payload={"key": "value"},
        ))
        assert obs.reward < 0

    def test_fix_all_issues_completes_episode(self):
        """Fixing all issues should mark episode as done with completion bonus."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        # Fix auth
        env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Authorization": "Bearer valid_token_123"},
        ))
        # Fix content-type
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        # Both easy-task issues fixed -> episode terminates early.
        assert obs.done is True
        assert obs.issues_fixed == 2
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
# βββ Grading Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
class TestGrading:
    """Test the grading formula."""

    def test_grade_no_fixes_is_low(self):
        """Grade with no fixes should be very low (just exploration bonus)."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        env.step(ApiDebugAction(action_type="inspect_logs", target="payment_client"))
        score = env.grade()
        assert 0.0 < score < 0.1  # Exploration bonus only

    def test_grade_all_fixes_is_high(self):
        """Grade with all fixes should be high."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        # Fix both easy-task issues back to back, with no wasted steps.
        env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Authorization": "Bearer valid_token_123"},
        ))
        env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        score = env.grade()
        assert score > 0.8  # Should be high with efficiency bonus

    def test_grade_strictly_between_0_and_1(self):
        """Grade must be strictly in (0, 1), never exactly 0.0 or 1.0."""
        for task_id in get_all_task_ids():
            env = ApiDebugEnvironment(task_id=task_id)
            env.reset()
            score = env.grade()
            assert 0.0 < score < 1.0, f"Score for {task_id} was {score}"

    def test_efficiency_bonus(self):
        """Faster solutions should score higher."""
        # Quick partial solve (1 step, fix 1 of 2 issues)
        env1 = ApiDebugEnvironment(task_id="easy")
        env1.reset()
        env1.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        score_fast = env1.grade()

        # Slow partial solve (many inspection steps, then fix same 1 issue)
        env2 = ApiDebugEnvironment(task_id="easy")
        env2.reset()
        for _ in range(10):
            env2.step(ApiDebugAction(action_type="inspect_logs", target="payment_client"))
        env2.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        score_slow = env2.grade()

        # Same fix achieved, fewer steps used -> strictly higher grade.
        assert score_fast > score_slow, f"Fast={score_fast} should beat Slow={score_slow}"
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
# βββ Episode Termination Tests ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
class TestEpisodeTermination:
    """Tests for episode ending conditions."""

    def test_out_of_steps_ends_episode(self):
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = None
        # Burn through the full easy-task step budget (15 steps).
        for _ in range(15):
            obs = env.step(ApiDebugAction(
                action_type="inspect_logs",
                target="payment_client",
            ))
        assert obs.done is True
        assert obs.remaining_steps == 0

    def test_invalid_action_type_penalized(self):
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        bogus = ApiDebugAction(
            action_type="nonexistent_action",
            target="payment_client",
        )
        obs = env.step(bogus)
        assert obs.reward < 0
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
# βββ Value Matching Tests βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
class TestValueMatching:
    """Test the _values_match method directly."""

    def setup_method(self):
        # _values_match needs no scenario state, so a bare environment suffices.
        self.env = ApiDebugEnvironment(task_id="easy")

    def test_exact_string_match(self):
        assert self.env._values_match("application/json", "application/json")

    def test_case_insensitive_match(self):
        assert self.env._values_match("Application/JSON", "application/json")

    def test_numeric_exact(self):
        assert self.env._values_match(10, 10)

    def test_numeric_tolerance(self):
        assert self.env._values_match(10, 9)  # Within 25%
        assert not self.env._values_match(10, 5)  # Outside 25%

    def test_boolean_match(self):
        assert self.env._values_match(True, True)
        assert not self.env._values_match(True, False)

    def test_boolean_from_string(self):
        # String "true"/"false" is coerced to the expected boolean.
        assert self.env._values_match(True, "true")
        assert self.env._values_match(False, "false")

    def test_list_containment(self):
        # Submitted list may be a superset of the expected elements.
        assert self.env._values_match([429, 500], [429, 500])
        assert self.env._values_match([429, 500], [500, 429, 502])

    def test_bearer_token_pattern(self):
        # "<token>" placeholder accepts any non-empty token after the prefix.
        assert self.env._values_match("Bearer <token>", "Bearer my_secret_key")
        assert not self.env._values_match("Bearer <token>", "Bearer ")  # Empty token

    def test_wrong_value_rejected(self):
        assert not self.env._values_match("application/json", "text/xml")
        assert not self.env._values_match(10, 100)
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
# βββ Integration Test βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
class TestFullEpisode:
    """Test a complete episode flow."""

    def test_easy_full_solve(self):
        """Run a complete easy episode from start to finish."""
        # Exercises the full inspect -> diagnose -> fix loop in four steps.
        env = ApiDebugEnvironment(task_id="easy")
        obs = env.reset()

        # Step 1: Inspect logs
        obs = env.step(ApiDebugAction(
            action_type="inspect_logs",
            target="payment_client",
        ))
        assert obs.issues_found >= 1

        # Step 2: Inspect config
        obs = env.step(ApiDebugAction(
            action_type="inspect_config",
            target="payment_client",
        ))
        assert "headers" in obs.config_snapshot

        # Step 3: Fix auth
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Authorization": "Bearer my_token_123"},
        ))
        assert obs.issues_fixed >= 1

        # Step 4: Fix content-type
        obs = env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        # Both issues fixed -> early termination.
        assert obs.issues_fixed == 2
        assert obs.done is True

        # Grade
        # Efficient four-step solve should land well above 0.8.
        score = env.grade()
        assert score > 0.8
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
# Allow running this file directly without invoking the pytest CLI.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
|