Spaces:

agentDebugger
/

AgentDebugger-training-v3

Running

AgentDebugger-training-v3 / tests /test_graders.py

shank

complete project

0ee66d2 about 1 month ago

6.71 kB

	"""
	Tests for graders — determinism and range validation.
	"""

	import pytest
	from env.graders import get_grader
	from env.tasks.registry import get_task


	# ── Determinism Tests ────────────────────────────────────────────────────────

	def _make_dummy_attempts(n: int = 2, tests_passed: int = 3, tests_total: int = 8) -> list:
	    """Create dummy attempt data for testing.

	    Args:
	        n: Number of attempt records to generate.
	        tests_passed: Value stored under ``"tests_passed"`` in every record.
	        tests_total: Value stored under ``"tests_total"`` in every record.

	    Returns:
	        A list of ``n`` independent dicts. ``"attempt_number"`` runs from 1
	        to ``n``; every other field is identical across records.
	    """
	    tests_failed = tests_total - tests_passed
	    return [
	        {
	            "attempt_number": attempt_number,
	            "code_submitted": "def dummy(): pass",
	            "hypothesis": "The bug is in the loop condition",
	            "execution_output": f"{tests_passed} passed, {tests_failed} failed",
	            "tests_passed": tests_passed,
	            "tests_total": tests_total,
	            "execution_time_ms": 100,
	            "timed_out": False,
	        }
	        for attempt_number in range(1, n + 1)
	    ]


	def test_easy_grader_deterministic():
	    """Scoring the same easy-task input twice must give the same score."""
	    grader = get_grader("easy")
	    task = get_task("easy")
	    attempts = _make_dummy_attempts(2, tests_passed=7, tests_total=8)
	    hypotheses = ["The off by one error in the loop condition"]

	    # Build the argument tuple once so both calls receive exactly the same
	    # objects. Judging by the other tests in this file, the four numbers
	    # appear to be tests passed, tests total, attempts used and max attempts.
	    score_args = (task, attempts, 7, 8, 2, 5, hypotheses)
	    score1 = grader.score(*score_args)
	    score2 = grader.score(*score_args)
	    assert score1 == score2, f"Easy grader not deterministic: {score1} != {score2}"


	def test_medium_grader_deterministic():
	    """Scoring the same medium-task input twice must give the same score."""
	    grader = get_grader("medium")
	    task = get_task("medium")
	    attempts = _make_dummy_attempts(3, tests_passed=6, tests_total=10)
	    hypotheses = ["Bug is in hash_password bytes conversion"]

	    # Build the argument tuple once so both calls receive exactly the same
	    # objects. Judging by the other tests in this file, the four numbers
	    # appear to be tests passed, tests total, attempts used and max attempts.
	    score_args = (task, attempts, 6, 10, 3, 7, hypotheses)
	    score1 = grader.score(*score_args)
	    score2 = grader.score(*score_args)
	    assert score1 == score2, f"Medium grader not deterministic: {score1} != {score2}"


	def test_hard_grader_deterministic():
	    """Same input to hard grader must produce same output (excluding concurrent test randomness)."""
	    grader = get_grader("hard")
	    task = get_task("hard")
	    # Use buggy code so concurrent test is deterministically failing.
	    # NOTE(review): the dummy attempts submit "def dummy(): pass" yet report
	    # 8/8 tests passed; whether the hard grader re-runs the submitted code is
	    # not visible from this file - confirm the intent against the grader.
	    attempts = _make_dummy_attempts(2, tests_passed=8, tests_total=8)
	    hypotheses = ["race condition in increment"]

	    # Build the argument tuple once so both calls receive exactly the same
	    # objects.
	    score_args = (task, attempts, 8, 8, 2, 10, hypotheses)
	    score1 = grader.score(*score_args)
	    score2 = grader.score(*score_args)
	    assert score1 == score2, f"Hard grader not deterministic: {score1} != {score2}"


	# ── Range Tests ──────────────────────────────────────────────────────────────

	@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
	def test_grader_range_with_zero_attempts(task_id):
	    """Grader with zero attempts should return a score in [0.0, 1.0].

	    Runs once per task id; the test count and attempt budget are read from
	    the task's ``"tests_total"`` and ``"max_attempts"`` entries.
	    """
	    grader = get_grader(task_id)
	    task = get_task(task_id)

	    # No attempts, nothing passed, no hypotheses.
	    score = grader.score(
	        task,
	        [],
	        0,
	        task["tests_total"],
	        0,
	        task["max_attempts"],
	        [],
	    )
	    assert 0.0 <= score <= 1.0, f"{task_id} grader out of range: {score}"


	@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
	def test_grader_range_with_perfect_score(task_id):
	    """Grader with all tests passing should return a score in [0.0, 1.0].

	    Runs once per task id with a single attempt that reports every test as
	    passed.
	    """
	    grader = get_grader(task_id)
	    task = get_task(task_id)
	    tests_total = task["tests_total"]
	    attempts = _make_dummy_attempts(1, tests_passed=tests_total, tests_total=tests_total)
	    # The same hypothesis list is supplied for every parametrized task id.
	    hypotheses = ["off by one", "hash_password bytes", "race condition atomic lock"]

	    score = grader.score(
	        task,
	        attempts,
	        tests_total,
	        tests_total,
	        1,
	        task["max_attempts"],
	        hypotheses,
	    )
	    assert 0.0 <= score <= 1.0, f"{task_id} grader out of range: {score}"


	@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
	def test_grader_range_with_all_failures(task_id):
	    """Grader with no tests passing should return a score in [0.0, 1.0].

	    Runs once per task id with the full attempt budget used up, zero passing
	    tests and no hypotheses.
	    """
	    grader = get_grader(task_id)
	    task = get_task(task_id)
	    tests_total = task["tests_total"]
	    max_attempts = task["max_attempts"]
	    attempts = _make_dummy_attempts(max_attempts, tests_passed=0, tests_total=tests_total)

	    score = grader.score(task, attempts, 0, tests_total, max_attempts, max_attempts, [])
	    assert 0.0 <= score <= 1.0, f"{task_id} grader out of range: {score}"


	# ── Variance Tests (dummy vs perfect agents) ────────────────────────────────

	def test_easy_dummy_agent_low_score():
	    """A dummy agent submitting 'pass' should score < 0.15.

	    Five identical attempts are built, each submitting ``pass``, passing 0 of
	    8 tests and offering "I don't know" as the hypothesis.
	    """
	    grader = get_grader("easy")
	    task = get_task("easy")
	    attempts = [
	        {
	            "attempt_number": attempt_number,
	            "code_submitted": "pass",
	            "hypothesis": "I don't know",
	            "execution_output": "0 passed, 8 failed",
	            "tests_passed": 0,
	            "tests_total": 8,
	            "execution_time_ms": 50,
	            "timed_out": False,
	        }
	        for attempt_number in range(1, 6)
	    ]
	    hypotheses = ["I don't know"] * 5

	    score = grader.score(task, attempts, 0, 8, 5, 5, hypotheses)
	    assert score < 0.15, f"Dummy agent scored too high on easy: {score}"


	def test_easy_perfect_agent_high_score():
	    """A perfect agent should score > 0.85 on easy.

	    The single attempt submits the task's own ``ground_truth["fixed_code"]``
	    and reports 8 of 8 tests passed.
	    """
	    grader = get_grader("easy")
	    task = get_task("easy")
	    # One literal shared by the attempt record and the hypotheses list, so
	    # the two cannot drift apart.
	    hypothesis = "The off by one error: should be left <= right"
	    attempts = [
	        {
	            "attempt_number": 1,
	            "code_submitted": task["ground_truth"]["fixed_code"],
	            "hypothesis": hypothesis,
	            "execution_output": "8 passed, 0 failed",
	            "tests_passed": 8,
	            "tests_total": 8,
	            "execution_time_ms": 50,
	            "timed_out": False,
	        }
	    ]

	    score = grader.score(task, attempts, 8, 8, 1, 5, [hypothesis])
	    assert score > 0.85, f"Perfect agent scored too low on easy: {score}"


	def test_medium_red_herring_low_score():
	    """Agent whose hypotheses only blame authenticate_user should score < 0.60.

	    The attempts report 6 of 10 tests passed and every hypothesis targets
	    ``authenticate_user``; the assertion bounds the overall score returned
	    by the medium grader.
	    """
	    grader = get_grader("medium")
	    task = get_task("medium")
	    attempts = _make_dummy_attempts(3, tests_passed=6, tests_total=10)
	    hypotheses = [
	        "The bug is in authenticate_user, it's not checking credentials correctly",
	        "authenticate_user should handle the case differently",
	        "Fix authenticate_user to return True for valid users",
	    ]

	    score = grader.score(task, attempts, 6, 10, 3, 7, hypotheses)
	    # With only 6/10 tests and red herring hypotheses, score should be modest
	    assert score < 0.60, f"Red herring agent scored too high on medium: {score}"