# AgentDebugger-training-v3 / tests/test_integration.py
# shank
# Update: Made refinements to the project
# 159a5fa
"""
AgentDebuggerEnv — Integration Tests
====================================
Verifies the full episode lifecycle: reset -> step -> end.
Assumes the server is available via the DebuggerEnvironment class directly
(testing the logic, not the HTTP layer which is just a thin wrapper).
"""
import pytest
from env.environment import DebuggerEnvironment
from env.models import Action
def test_full_episode_easy():
    """Test a full successful episode on the 'easy' task.

    Resets the environment, submits the known-good binary search in a single
    ``submit_fix`` action, and checks that the episode finishes with every
    test passing and a grader score above 0.80.
    """
    env = DebuggerEnvironment()

    # 1. Reset: the episode must start unfinished with at least one failing test.
    obs = env.reset("easy")
    assert obs["task_id"] == "easy"
    assert obs["done"] is False
    assert obs["tests_passed"] < obs["tests_total"]

    # 2. Submit a fix (using known ground truth)
    # The easy task is binary search with 'left < right' instead of 'left <= right'.
    # The snippet starts at column 0 inside the literal so that the submitted
    # source is valid, correctly indented Python.
    ground_truth_code = """
def binary_search(arr, target):
    left, right = 0, len(arr) - 1
    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1
"""
    action = Action(
        action_type="submit_fix",
        fixed_code=ground_truth_code,
        hypothesis="Binary search termination condition should be left <= right to include all elements.",
    )
    result = env.step(action)

    # 3. Verify results
    observation = result["observation"]
    assert result["done"] is True, "episode should end once the fix passes"
    assert observation["tests_passed"] == observation["tests_total"], (
        f"expected all tests to pass, got "
        f"{observation['tests_passed']}/{observation['tests_total']}"
    )
    assert result["reward"]["grader_score"] > 0.80
def test_query_hint_system():
    """Check that a 'test_suggestion' context query on the hard task returns a hint.

    The hint text must mention "concurrent threads", and this first query
    must carry a step reward of exactly 0.0.
    """
    environment = DebuggerEnvironment()
    environment.reset("hard")

    hint_request = Action(action_type="query_context", query_type="test_suggestion")
    outcome = environment.step(hint_request)

    hint_text = outcome["info"]["query_result"]
    assert "concurrent threads" in hint_text

    step_reward = outcome["reward"]["step_reward"]
    assert step_reward == 0.0  # First query is free
def test_hard_grader_consensus():
    """
    Test that the hard grader runs multiple times.
    (We mock execute_code to simulate flakiness).

    ``env.graders.grader_hard.execute_code`` is patched with a fixed
    PASS/FAIL/PASS/FAIL/PASS sequence, and the resulting score is compared
    with the expected weighted total of 0.75.
    """
    from unittest.mock import patch

    from env.graders.grader_hard import HardGrader

    grader = HardGrader()

    # Mock execute_code to return success 3/5 times
    # Sequence: PASS, FAIL, PASS, FAIL, PASS
    with patch("env.graders.grader_hard.execute_code") as mock_exec:
        mock_exec.side_effect = [
            ("CONCURRENT PASS", False, 100),
            ("CONCURRENT FAIL", False, 100),
            ("CONCURRENT PASS", False, 100),
            ("CONCURRENT FAIL", False, 100),
            ("CONCURRENT PASS", False, 100),
        ]
        score = grader.score(
            task_config={"task_id": "hard", "ground_truth": {"hypothesis_keywords": ["race"]}},
            attempts=[{"tests_passed": 8, "attempt_number": 1, "code_submitted": "..."}],
            best_tests_passed=8,
            tests_total=8,
            attempts_used=1,
            max_attempts=10,
            hypotheses=["race condition"],
        )

    # 3/5 passes → should get partial credit (0.15) for concurrency
    # Sequential: 1.0 * 0.40 = 0.40
    # Concurrency: 0.15
    # Hypothesis: 1.0 * 0.20 = 0.20
    # Efficiency: (concurrent_score == 0.30) is False -> 0.0
    # Total: 0.75
    # The score is a sum of float weights, so compare with a tolerance rather
    # than exact equality; summation order or rounding inside the grader must
    # not make this test flaky.
    assert score == pytest.approx(0.75)