# AgentDebugger-training-v3 / tests / test_graders.py
# shank
# complete project
# 0ee66d2
"""
Tests for graders — determinism, range validation, and dummy-vs-perfect variance.
"""
import pytest
from env.graders import get_grader
from env.tasks.registry import get_task
# ── Determinism Tests ────────────────────────────────────────────────────────
def _make_dummy_attempts(n=2, tests_passed=3, tests_total=8):
"""Create dummy attempt data for testing."""
return [
{
"attempt_number": i + 1,
"code_submitted": "def dummy(): pass",
"hypothesis": "The bug is in the loop condition",
"execution_output": f"{tests_passed} passed, {tests_total - tests_passed} failed",
"tests_passed": tests_passed,
"tests_total": tests_total,
"execution_time_ms": 100,
"timed_out": False,
}
for i in range(n)
]
def test_easy_grader_deterministic():
    """Score identical easy-task input twice and require equal results."""
    grader = get_grader("easy")
    task = get_task("easy")
    # Bundle the arguments once so both calls receive exactly the same objects.
    # The four integers are presumably tests passed, tests total, attempts
    # used and max attempts, going by how the range tests call score().
    score_args = (
        task,
        _make_dummy_attempts(2, tests_passed=7, tests_total=8),
        7,
        8,
        2,
        5,
        ["The off by one error in the loop condition"],
    )
    score1 = grader.score(*score_args)
    score2 = grader.score(*score_args)
    assert score1 == score2, f"Easy grader not deterministic: {score1} != {score2}"
def test_medium_grader_deterministic():
    """Score identical medium-task input twice and require equal results."""
    grader = get_grader("medium")
    task = get_task("medium")
    # Bundle the arguments once so both calls receive exactly the same objects.
    score_args = (
        task,
        _make_dummy_attempts(3, tests_passed=6, tests_total=10),
        6,
        10,
        3,
        7,
        ["Bug is in hash_password bytes conversion"],
    )
    score1 = grader.score(*score_args)
    score2 = grader.score(*score_args)
    assert score1 == score2, f"Medium grader not deterministic: {score1} != {score2}"
def test_hard_grader_deterministic():
    """Same input to hard grader must produce same output (excluding concurrent test randomness)."""
    grader = get_grader("hard")
    task = get_task("hard")
    # NOTE(review): these attempts carry the placeholder submission
    # (`def dummy(): pass`) with 8/8 tests reported as passing -- not the
    # task's buggy code. Presumably any concurrency check the grader runs
    # against that placeholder fails the same way on every call, which keeps
    # the two scores comparable; confirm against the hard grader.
    attempts = _make_dummy_attempts(2, tests_passed=8, tests_total=8)
    hypotheses = ["race condition in increment"]
    score1 = grader.score(task, attempts, 8, 8, 2, 10, hypotheses)
    score2 = grader.score(task, attempts, 8, 8, 2, 10, hypotheses)
    assert score1 == score2, f"Hard grader not deterministic: {score1} != {score2}"
# ── Range Tests ──────────────────────────────────────────────────────────────
@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
def test_grader_range_with_zero_attempts(task_id):
    """With no attempts and no hypotheses, the score must stay within [0.0, 1.0]."""
    grader = get_grader(task_id)
    task = get_task(task_id)
    no_attempts = []
    no_hypotheses = []
    score = grader.score(
        task,
        no_attempts,
        0,
        task["tests_total"],
        0,
        task["max_attempts"],
        no_hypotheses,
    )
    assert 0.0 <= score <= 1.0, f"{task_id} grader out of range: {score}"
@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
def test_grader_range_with_perfect_score(task_id):
    """With every test passing on the first attempt, the score must stay within [0.0, 1.0]."""
    grader = get_grader(task_id)
    task = get_task(task_id)
    tests_total = task["tests_total"]
    # A single attempt that reports all of the task's tests as passing.
    attempts = _make_dummy_attempts(
        1,
        tests_passed=tests_total,
        tests_total=tests_total,
    )
    # The same three hypotheses are supplied regardless of which task is graded.
    hypotheses = [
        "off by one",
        "hash_password bytes",
        "race condition atomic lock",
    ]
    score = grader.score(
        task,
        attempts,
        tests_total,
        tests_total,
        1,
        task["max_attempts"],
        hypotheses,
    )
    assert 0.0 <= score <= 1.0, f"{task_id} grader out of range: {score}"
@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
def test_grader_range_with_all_failures(task_id):
    """With the attempt budget exhausted and nothing passing, the score must stay within [0.0, 1.0]."""
    grader = get_grader(task_id)
    task = get_task(task_id)
    tests_total = task["tests_total"]
    attempt_budget = task["max_attempts"]
    # Use up the whole budget with attempts that pass zero tests.
    attempts = _make_dummy_attempts(
        attempt_budget,
        tests_passed=0,
        tests_total=tests_total,
    )
    score = grader.score(
        task,
        attempts,
        0,
        tests_total,
        attempt_budget,
        attempt_budget,
        [],
    )
    assert 0.0 <= score <= 1.0, f"{task_id} grader out of range: {score}"
# ── Variance Tests (dummy vs perfect agents) ────────────────────────────────
def test_easy_dummy_agent_low_score():
    """Five 'pass' submissions that pass no tests must score below 0.15 on easy."""
    grader = get_grader("easy")
    task = get_task("easy")
    clueless = "I don't know"
    # Fields shared by every failed attempt; only the attempt number differs.
    failed_attempt = {
        "code_submitted": "pass",
        "hypothesis": clueless,
        "execution_output": "0 passed, 8 failed",
        "tests_passed": 0,
        "tests_total": 8,
        "execution_time_ms": 50,
        "timed_out": False,
    }
    attempts = [
        {"attempt_number": number, **failed_attempt}
        for number in range(1, 6)
    ]
    score = grader.score(task, attempts, 0, 8, 5, 5, [clueless] * 5)
    assert score < 0.15, f"Dummy agent scored too high on easy: {score}"
def test_easy_perfect_agent_high_score():
    """One attempt submitting the task's fixed code with 8/8 passing must score above 0.85."""
    grader = get_grader("easy")
    task = get_task("easy")
    diagnosis = "The off by one error: should be left <= right"
    # The submission is the reference fix stored with the task itself.
    winning_attempt = {
        "attempt_number": 1,
        "code_submitted": task["ground_truth"]["fixed_code"],
        "hypothesis": diagnosis,
        "execution_output": "8 passed, 0 failed",
        "tests_passed": 8,
        "tests_total": 8,
        "execution_time_ms": 50,
        "timed_out": False,
    }
    score = grader.score(task, [winning_attempt], 8, 8, 1, 5, [diagnosis])
    assert score > 0.85, f"Perfect agent scored too low on easy: {score}"
def test_medium_red_herring_low_score():
    """Agent whose hypotheses only blame authenticate_user should score < 0.60 on medium.

    The attempts report 6/10 tests passing across 3 of 7 attempts, and every
    hypothesis points at ``authenticate_user``. The bound checked here is on
    the overall score returned by the grader, not on a separate hypothesis
    sub-score.
    """
    grader = get_grader("medium")
    task = get_task("medium")
    attempts = _make_dummy_attempts(3, tests_passed=6, tests_total=10)
    hypotheses = [
        "The bug is in authenticate_user, it's not checking credentials correctly",
        "authenticate_user should handle the case differently",
        "Fix authenticate_user to return True for valid users",
    ]
    score = grader.score(task, attempts, 6, 10, 3, 7, hypotheses)
    # With only 6/10 tests and red herring hypotheses, score should be modest
    assert score < 0.60, f"Red herring agent scored too high on medium: {score}"