Spaces:

agentDebugger
/

AgentDebugger-training-v3

Running

AgentDebugger-training-v3 / tests /test_integration.py

shank

Update: Made refinements to the project

159a5fa 25 days ago

3.4 kB

	"""
	AgentDebuggerEnv — Integration Tests
	====================================
	Verifies the full episode lifecycle: reset -> step -> end.
	Drives the DebuggerEnvironment class directly instead of a running server,
	so the tests cover the environment logic rather than the HTTP layer (which is just a thin wrapper).
	"""

	import pytest
	from env.environment import DebuggerEnvironment
	from env.models import Action

	def test_full_episode_easy():
	    """Test a full successful episode on the 'easy' task.

	    Resets the environment, submits the known-good binary search in a single
	    ``submit_fix`` action, and checks that the episode ends with every test
	    passing and a grader score above 0.80.
	    """
	    # Imported locally so the test does not rely on the module's import block.
	    from textwrap import dedent

	    env = DebuggerEnvironment()

	    # 1. Reset: the task must start unfinished with at least one failing test.
	    obs = env.reset("easy")
	    assert obs["task_id"] == "easy"
	    assert obs["done"] is False
	    assert obs["tests_passed"] < obs["tests_total"]

	    # 2. Submit a fix (using known ground truth).
	    # The easy task is binary search with 'left < right' instead of 'left <= right'.
	    # dedent() strips the literal's source indentation so the submitted code
	    # starts at column 0 and keeps only its own nested indentation.
	    ground_truth_code = dedent(
	        """
	        def binary_search(arr, target):
	            left, right = 0, len(arr) - 1
	            while left <= right:
	                mid = (left + right) // 2
	                if arr[mid] == target:
	                    return mid
	                elif arr[mid] < target:
	                    left = mid + 1
	                else:
	                    right = mid - 1
	            return -1
	        """
	    )
	    action = Action(
	        action_type="submit_fix",
	        fixed_code=ground_truth_code,
	        hypothesis="Binary search termination condition should be left <= right to include all elements.",
	    )

	    result = env.step(action)

	    # 3. Verify results: episode finished, all tests green, high grader score.
	    assert result["done"] is True
	    assert result["observation"]["tests_passed"] == result["observation"]["tests_total"]
	    assert result["reward"]["grader_score"] > 0.80

	def test_query_hint_system():
	    """Check the hint returned by a ``query_context`` step on the 'hard' task.

	    Issues one ``test_suggestion`` query right after reset and verifies both
	    the hint text and the step reward reported for it.
	    """
	    debugger = DebuggerEnvironment()
	    debugger.reset("hard")

	    outcome = debugger.step(
	        Action(action_type="query_context", query_type="test_suggestion")
	    )

	    hint_text = outcome["info"]["query_result"]
	    assert "concurrent threads" in hint_text

	    # First query is free
	    assert outcome["reward"]["step_reward"] == 0.0

	def test_hard_grader_consensus():
	    """
	    Test that the hard grader runs multiple times.
	    (We mock execute_code to simulate flakiness).

	    ``execute_code`` is patched inside ``env.graders.grader_hard`` so that
	    successive calls return PASS, FAIL, PASS, FAIL, PASS, and the score for a
	    single attempt that passed 8 of 8 tests is compared with the expected 0.75.
	    """
	    from unittest.mock import patch
	    from env.graders.grader_hard import HardGrader

	    grader = HardGrader()

	    # Mock execute_code to return success 3/5 times
	    # Sequence: PASS, FAIL, PASS, FAIL, PASS
	    flaky_runs = [
	        ("CONCURRENT PASS", False, 100),
	        ("CONCURRENT FAIL", False, 100),
	        ("CONCURRENT PASS", False, 100),
	        ("CONCURRENT FAIL", False, 100),
	        ("CONCURRENT PASS", False, 100),
	    ]

	    with patch("env.graders.grader_hard.execute_code") as mock_exec:
	        mock_exec.side_effect = flaky_runs

	        score = grader.score(
	            task_config={"task_id": "hard", "ground_truth": {"hypothesis_keywords": ["race"]}},
	            attempts=[{"tests_passed": 8, "attempt_number": 1, "code_submitted": "..."}],
	            best_tests_passed=8,
	            tests_total=8,
	            attempts_used=1,
	            max_attempts=10,
	            hypotheses=["race condition"],
	        )

	    # The point of this test is the repeated execution, so check it explicitly
	    # instead of inferring it from the score alone.
	    assert mock_exec.call_count > 1

	    # 3/5 passes → should get partial credit (0.15) for concurrency
	    # Sequential: 1.0 * 0.40 = 0.40
	    # Concurrency: 0.15
	    # Hypothesis: 1.0 * 0.20 = 0.20
	    # Efficiency: (concurrent_score == 0.30) is False -> 0.0
	    # Total: 0.75
	    # The total is a sum of float weights, so compare with a tolerance rather
	    # than exact equality.
	    assert score == pytest.approx(0.75)