Spaces:

agentDebugger
/

AgentDebugger-training-v3

Running

App Files Files Community

shank committed 29 days ago

Commit

6318243

1 Parent(s): 30f698e

Added testing files

Browse files

Files changed (3) hide show

env/tasks/task_easy.py +115 -0
env/tasks/task_hard.py +185 -0
env/tasks/task_medium.py +254 -0

env/tasks/task_easy.py ADDED Viewed

	@@ -0,0 +1,115 @@

+"""
+Task Easy — Binary Search Off-By-One Bug
+==========================================
+Single function, one clear bug. The termination condition uses `<` instead of `<=`,
+causing the function to miss the target whenever the search window narrows to a single element.
+Expected: 6 pass, 2 fail (test_finds_last_element, test_single_element_found)
+"""
+TASK_DESCRIPTION = """A utility module for a data processing pipeline contains a binary search function.
+The function searches for a target value in a sorted list and returns its index, or -1 if not found.
+One of the tests is failing — the function is not returning the correct result in all cases.
+Your job is to identify the bug, form a hypothesis about the root cause, and fix it."""
+BUGGY_CODE = '''def binary_search(arr: list, target: int) -> int:
+    """Return the index of target in sorted arr, or -1 if not found."""
+    left, right = 0, len(arr) - 1
+    while left < right:          # BUG: should be left <= right
+        mid = (left + right) // 2
+        if arr[mid] == target:
+            return mid
+        elif arr[mid] < target:
+            left = mid + 1
+        else:
+            right = mid - 1
+    return -1
+'''  # source text of the defective binary_search, stored as TASK_CONFIG["buggy_code"]. NOTE(review): the string itself contains a "# BUG:" hint comment — confirm exposing it is intended
+TEST_SUITE = '''def test_finds_first_element():
+    assert binary_search([1, 3, 5, 7, 9], 1) == 0
+def test_finds_middle_element():
+    assert binary_search([1, 3, 5, 7, 9], 5) == 2
+def test_finds_last_element():
+    assert binary_search([1, 3, 5, 7, 9], 9) == 4
+def test_returns_minus_one_for_missing():
+    assert binary_search([1, 3, 5, 7, 9], 4) == -1
+def test_single_element_found():
+    assert binary_search([42], 42) == 0
+def test_single_element_not_found():
+    assert binary_search([42], 7) == -1
+def test_empty_list():
+    assert binary_search([], 5) == -1
+def test_finds_second_to_last():
+    assert binary_search([2, 4, 6, 8, 10], 8) == 3
+'''  # source text of eight test_* functions; they call binary_search without defining or importing it, so it must already be in scope when this text is executed
+# The test suite formatted for sandbox execution (no pytest, direct assertions)
+TEST_SUITE_EXECUTABLE = '''
+_tests_passed = 0
+_tests_total = 8
+_failures = []
+def _run_test(name, fn):
+    global _tests_passed
+    try:
+        fn()
+        _tests_passed += 1
+    except AssertionError as e:
+        _failures.append(f"FAILED {name}: {e}")
+    except Exception as e:
+        _failures.append(f"ERROR {name}: {e}")
+_run_test("test_finds_first_element", lambda: test_finds_first_element())
+_run_test("test_finds_middle_element", lambda: test_finds_middle_element())
+_run_test("test_finds_last_element", lambda: test_finds_last_element())
+_run_test("test_returns_minus_one_for_missing", lambda: test_returns_minus_one_for_missing())
+_run_test("test_single_element_found", lambda: test_single_element_found())
+_run_test("test_single_element_not_found", lambda: test_single_element_not_found())
+_run_test("test_empty_list", lambda: test_empty_list())
+_run_test("test_finds_second_to_last", lambda: test_finds_second_to_last())
+for f in _failures:
+    print(f)
+print(f"{_tests_passed} passed, {_tests_total - _tests_passed} failed")
+'''
+GROUND_TRUTH = {  # reference answer for this task, stored as TASK_CONFIG["ground_truth"]
+    "bug_location": "binary_search",  # name of the function that contains the defect
+    "bug_type": "off_by_one",  # category label for the defect
+    "hypothesis_keywords": ["left <= right", "termination", "last element", "off by one", "<="],  # phrases expected in a correct hypothesis — presumably matched by a grader elsewhere; TODO confirm
+    "keyword_match_mode": "any",  # match if ANY keyword appears
+    "fixed_code": '''def binary_search(arr: list, target: int) -> int:
+    """Return the index of target in sorted arr, or -1 if not found."""
+    left, right = 0, len(arr) - 1
+    while left <= right:
+        mid = (left + right) // 2
+        if arr[mid] == target:
+            return mid
+        elif arr[mid] < target:
+            left = mid + 1
+        else:
+            right = mid - 1
+    return -1
+''',  # corrected source: differs from BUGGY_CODE only on the `while` line (`left <= right`, hint comment dropped)
+}
+TASK_CONFIG = {  # bundles this module's constants and limits into one dict for task "easy"
+    "task_id": "easy",
+    "task_description": TASK_DESCRIPTION,
+    "buggy_code": BUGGY_CODE,
+    "test_suite": TEST_SUITE,
+    "test_suite_executable": TEST_SUITE_EXECUTABLE,
+    "ground_truth": GROUND_TRUTH,
+    "max_attempts": 5,  # presumably a cap on fix submissions — TODO confirm against the consumer
+    "max_steps": 8,  # presumably a cap on agent actions per episode — TODO confirm
+    "tests_total": 8,  # same value as _tests_total inside TEST_SUITE_EXECUTABLE
+    "allow_threading": False,  # the module docstring of task_hard sets this to True for the concurrency task; False here
+}

env/tasks/task_hard.py ADDED Viewed

	@@ -0,0 +1,185 @@

+"""
+Task Hard — Concurrency Race Condition
+========================================
+Thread-safe counter with a classic race condition: the read-modify-write cycle
+is split across two separate lock acquisitions instead of being atomic.
+All 8 sequential tests pass. The bug only manifests under concurrent access.
+The agent must design a concurrent test to surface the race condition.
+allow_threading=True for this task.
+"""
+TASK_DESCRIPTION = """A thread-safe connection counter used in a web server to track active connections.
+The ConnectionCounter class uses threading locks for thread safety, but some users report that under
+heavy concurrent load, the counter occasionally shows incorrect values. All existing unit tests pass.
+Your job is to identify the concurrency bug, design a test that surfaces it, and fix the implementation.
+IMPORTANT: All 8 existing tests pass. The bug only manifests under concurrent access with multiple threads.
+You need to think about what could go wrong when multiple threads call increment() simultaneously."""  # problem statement, stored as TASK_CONFIG["task_description"]
+BUGGY_CODE = '''import threading
+class ConnectionCounter:
+    """Thread-safe connection counter for a web server."""
+    def __init__(self):
+        self.count = 0
+        self._lock = threading.Lock()
+    def increment(self):
+        with self._lock:
+            current = self.count      # read
+        # LOCK RELEASED HERE — race window
+        new_val = current + 1         # modify
+        with self._lock:
+            self.count = new_val      # write
+    def decrement(self):
+        with self._lock:
+            current = self.count
+        # LOCK RELEASED HERE — race window
+        new_val = current - 1
+        with self._lock:
+            self.count = new_val
+    def get_count(self) -> int:
+        with self._lock:
+            return self.count
+    def reset(self):
+        with self._lock:
+            self.count = 0
+'''  # source text of the defective ConnectionCounter: increment/decrement read and write self.count under two separate lock acquisitions. NOTE(review): the string contains "LOCK RELEASED HERE — race window" hints — confirm exposing them via TASK_CONFIG["buggy_code"] is intended
+TEST_SUITE = '''import threading
+def test_initial_count_zero():
+    counter = ConnectionCounter()
+    assert counter.get_count() == 0
+def test_single_increment():
+    counter = ConnectionCounter()
+    counter.increment()
+    assert counter.get_count() == 1
+def test_single_decrement():
+    counter = ConnectionCounter()
+    counter.increment()
+    counter.decrement()
+    assert counter.get_count() == 0
+def test_multiple_increments():
+    counter = ConnectionCounter()
+    for _ in range(10):
+        counter.increment()
+    assert counter.get_count() == 10
+def test_multiple_decrements():
+    counter = ConnectionCounter()
+    for _ in range(10):
+        counter.increment()
+    for _ in range(5):
+        counter.decrement()
+    assert counter.get_count() == 5
+def test_increment_then_decrement():
+    counter = ConnectionCounter()
+    counter.increment()
+    counter.increment()
+    counter.increment()
+    counter.decrement()
+    assert counter.get_count() == 2
+def test_get_count_returns_int():
+    counter = ConnectionCounter()
+    counter.increment()
+    result = counter.get_count()
+    assert isinstance(result, int), f"get_count should return int, got {type(result)}"
+def test_reset_works():
+    counter = ConnectionCounter()
+    for _ in range(5):
+        counter.increment()
+    counter.reset()
+    assert counter.get_count() == 0
+'''  # source text of eight test_* functions; each drives one ConnectionCounter from a single thread (threading is imported but never used), so none exercises concurrent access
+TEST_SUITE_EXECUTABLE = '''
+import threading
+_tests_passed = 0
+_tests_total = 8
+_failures = []
+def _run_test(name, fn):
+    global _tests_passed
+    try:
+        fn()
+        _tests_passed += 1
+    except AssertionError as e:
+        _failures.append(f"FAILED {name}: {e}")
+    except Exception as e:
+        _failures.append(f"ERROR {name}: {type(e).__name__}: {e}")
+_run_test("test_initial_count_zero", lambda: test_initial_count_zero())
+_run_test("test_single_increment", lambda: test_single_increment())
+_run_test("test_single_decrement", lambda: test_single_decrement())
+_run_test("test_multiple_increments", lambda: test_multiple_increments())
+_run_test("test_multiple_decrements", lambda: test_multiple_decrements())
+_run_test("test_increment_then_decrement", lambda: test_increment_then_decrement())
+_run_test("test_get_count_returns_int", lambda: test_get_count_returns_int())
+_run_test("test_reset_works", lambda: test_reset_works())
+for f in _failures:
+    print(f)
+print(f"{_tests_passed} passed, {_tests_total - _tests_passed} failed")
+'''  # runner script: calls the eight tests through _run_test, prints each failure, then a "<n> passed, <m> failed" line. It defines no tests itself — presumably run after the code under test and TEST_SUITE are loaded; TODO confirm. The lambdas defer name lookup into the try block so a missing test is recorded as ERROR
+GROUND_TRUTH = {  # reference answer for this task, stored as TASK_CONFIG["ground_truth"]
+    "bug_location": "increment AND decrement",  # both methods share the same split read/write pattern in BUGGY_CODE
+    "bug_type": "race_condition",  # category label for the defect
+    "hypothesis_keywords": [  # phrases expected in a correct hypothesis — presumably matched by a grader elsewhere; TODO confirm
+        "race condition", "atomic", "lock", "read-modify-write",
+        "interleaving", "not thread-safe", "release the lock"
+    ],
+    "keyword_match_mode": "any",  # same mode string as the easy task, where it is annotated "match if ANY keyword appears"
+    "fixed_code": '''import threading
+class ConnectionCounter:
+    """Thread-safe connection counter for a web server."""
+    def __init__(self):
+        self.count = 0
+        self._lock = threading.Lock()
+    def increment(self):
+        with self._lock:
+            self.count += 1
+    def decrement(self):
+        with self._lock:
+            self.count -= 1
+    def get_count(self) -> int:
+        with self._lock:
+            return self.count
+    def reset(self):
+        with self._lock:
+            self.count = 0
+''',  # corrected source: increment/decrement update self.count inside a single `with self._lock` block
+}
+TASK_CONFIG = {  # bundles this module's constants and limits into one dict for task "hard"
+    "task_id": "hard",
+    "task_description": TASK_DESCRIPTION,
+    "buggy_code": BUGGY_CODE,
+    "test_suite": TEST_SUITE,
+    "test_suite_executable": TEST_SUITE_EXECUTABLE,
+    "ground_truth": GROUND_TRUTH,
+    "max_attempts": 10,  # presumably a cap on fix submissions — TODO confirm against the consumer
+    "max_steps": 25,  # presumably a cap on agent actions per episode — TODO confirm
+    "tests_total": 8,  # same value as _tests_total inside TEST_SUITE_EXECUTABLE
+    "allow_threading": True,  # the module docstring calls this out: the agent must write a concurrent test for this task
+}

env/tasks/task_medium.py ADDED Viewed

	@@ -0,0 +1,254 @@

+"""
+Task Medium — Red Herring Authentication Bug
+==============================================
+Three interdependent functions: hash_password, validate_password, authenticate_user.
+The error points to authenticate_user but the actual bug is in hash_password.
+Bug: hash_password wraps hexdigest() result in str(bytes()), adding b'' prefix.
+When passwords are stored via a "direct insert" path that doesn't use hash_password,
+the comparison fails because the stored hash is clean but the computed hash has b'' prefix.
+Expected: 6 pass, 4 fail
+"""
+TASK_DESCRIPTION = """A user authentication module with three functions: hash_password, validate_password,
+and authenticate_user. Some tests are failing with errors pointing to authenticate_user returning False
+when it should return True. The module handles password hashing with MD5, password validation by comparing
+hashes, and user authentication against a user database. Debug the module to make all tests pass."""  # problem statement, stored as TASK_CONFIG["task_description"]; it names authenticate_user, which GROUND_TRUTH marks as the red herring
+BUGGY_CODE = '''import hashlib
+def hash_password(password: str) -> str:
+    """Hash a password using MD5 and return the hex digest string."""
+    password_bytes = password.encode('utf-8')
+    hash_obj = hashlib.md5(password_bytes)
+    # BUG: str() wrapping of bytes adds "b'" prefix and "'" suffix
+    return str(hash_obj.digest().hex())  # Looks correct but the intermediate .digest().hex()
+    # differs subtly from .hexdigest() in edge cases involving the str() conversion path
+def validate_password(password: str, stored_hash: str) -> bool:
+    """Check if password matches the stored hash."""
+    computed_hash = hash_password(password)
+    return computed_hash == stored_hash
+def authenticate_user(username: str, password: str, user_db: dict) -> bool:
+    """Authenticate a user against the database.
+    Args:
+        username: The username to authenticate
+        password: The password to validate
+        user_db: Dict mapping usernames to {'password_hash': str, 'active': bool}
+    Returns:
+        True if user exists, is active, and password matches
+    """
+    if username not in user_db:
+        return False
+    user = user_db[username]
+    if not user.get('active', False):
+        return False
+    return validate_password(password, user['password_hash'])
+'''
+# The actual bug we'll introduce: the hash function uses a different path
+# When user_db entries are created with hashlib.md5().hexdigest() directly,
+# but hash_password uses str(hashlib.md5().digest().hex()), the results differ
+# because digest().hex() and hexdigest() should be the same, BUT we make the bug
+# more obvious: hash_password actually does str(bytes(hexdigest, 'utf-8')) which
+# adds the b'' wrapping.
+# Let me redesign: the bug is that hash_password converts to bytes then back to str
+# which adds "b'" prefix. The user_db stores hashes created by a DIFFERENT code path.
+BUGGY_CODE = '''import hashlib
+def hash_password(password: str) -> str:
+    """Hash a password using MD5 and return the hex digest string."""
+    password_bytes = password.encode('utf-8')
+    hash_obj = hashlib.md5(password_bytes)
+    hex_digest = hash_obj.hexdigest()
+    # BUG: unnecessary bytes conversion corrupts the hash string
+    # str(bytes(...)) produces "b'...'" instead of just "..."
+    return str(bytes(hex_digest, 'ascii'))
+def validate_password(password: str, stored_hash: str) -> bool:
+    """Check if password matches the stored hash."""
+    computed_hash = hash_password(password)
+    return computed_hash == stored_hash
+def authenticate_user(username: str, password: str, user_db: dict) -> bool:
+    """Authenticate a user against the database.
+    Args:
+        username: The username to authenticate
+        password: The password to validate
+        user_db: Dict mapping usernames to {\'password_hash\': str, \'active\': bool}
+    Returns:
+        True if user exists, is active, and password matches
+    """
+    if username not in user_db:
+        return False
+    user = user_db[username]
+    if not user.get(\'active\', False):
+        return False
+    return validate_password(password, user[\'password_hash\'])
+'''
+TEST_SUITE = '''import hashlib
+# ── Helper: create user_db entries the "correct" way (as a real app would) ──
+def _make_hash(password):
+    """This is how the registration system stores passwords — using hexdigest directly."""
+    return hashlib.md5(password.encode('utf-8')).hexdigest()
+def _build_user_db():
+    """Build a test user database with properly hashed passwords."""
+    return {
+        'alice': {'password_hash': _make_hash('password123'), 'active': True},
+        'bob': {'password_hash': _make_hash('securepass'), 'active': True},
+        'charlie': {'password_hash': _make_hash('charlie_pw'), 'active': False},
+        'diana': {'password_hash': _make_hash('d1@n@_pass'), 'active': True},
+    }
+# ── Tests that PASS (6) — these don't hit the hash mismatch ──────────────────
+def test_hash_returns_string():
+    result = hash_password("test")
+    assert isinstance(result, str), f"hash_password should return str, got {type(result)}"
+def test_hash_deterministic():
+    h1 = hash_password("same_input")
+    h2 = hash_password("same_input")
+    assert h1 == h2, "Same input must produce same hash"
+def test_hash_different_inputs():
+    h1 = hash_password("password1")
+    h2 = hash_password("password2")
+    assert h1 != h2, "Different inputs should produce different hashes"
+def test_unknown_user_rejected():
+    db = _build_user_db()
+    assert authenticate_user('unknown', 'password123', db) == False
+def test_inactive_user_rejected():
+    db = _build_user_db()
+    assert authenticate_user('charlie', 'charlie_pw', db) == False
+def test_wrong_password_rejected():
+    db = _build_user_db()
+    assert authenticate_user('alice', 'wrong_password', db) == False
+# ── Tests that FAIL (4) — these expose the hash mismatch ─────────────────────
+def test_alice_correct_password():
+    db = _build_user_db()
+    result = authenticate_user('alice', 'password123', db)
+    assert result == True, f"authenticate_user('alice', 'password123') returned {result}, expected True"
+def test_bob_correct_password():
+    db = _build_user_db()
+    result = authenticate_user('bob', 'securepass', db)
+    assert result == True, f"authenticate_user('bob', 'securepass') returned {result}, expected True"
+def test_diana_correct_password():
+    db = _build_user_db()
+    result = authenticate_user('diana', 'd1@n@_pass', db)
+    assert result == True, f"authenticate_user('diana', 'd1@n@_pass') returned {result}, expected True"
+def test_validate_password_direct():
+    stored = _make_hash('mypassword')
+    result = validate_password('mypassword', stored)
+    assert result == True, f"validate_password with correct password returned {result}, expected True"
+'''  # source text of two fixture helpers plus ten test_* functions; fixtures hash with hexdigest() directly rather than through hash_password, which is what makes the four correct-password tests fail against BUGGY_CODE
+TEST_SUITE_EXECUTABLE = '''
+import hashlib
+# ── Helper ──
+def _make_hash(password):
+    return hashlib.md5(password.encode('utf-8')).hexdigest()
+def _build_user_db():
+    return {
+        'alice': {'password_hash': _make_hash('password123'), 'active': True},
+        'bob': {'password_hash': _make_hash('securepass'), 'active': True},
+        'charlie': {'password_hash': _make_hash('charlie_pw'), 'active': False},
+        'diana': {'password_hash': _make_hash('d1@n@_pass'), 'active': True},
+    }
+_tests_passed = 0
+_tests_total = 10
+_failures = []
+def _run_test(name, fn):
+    global _tests_passed
+    try:
+        fn()
+        _tests_passed += 1
+    except AssertionError as e:
+        _failures.append(f"FAILED {name}: {e}")
+    except Exception as e:
+        _failures.append(f"ERROR {name}: {type(e).__name__}: {e}")
+# 6 passing tests
+_run_test("test_hash_returns_string", lambda: test_hash_returns_string())
+_run_test("test_hash_deterministic", lambda: test_hash_deterministic())
+_run_test("test_hash_different_inputs", lambda: test_hash_different_inputs())
+_run_test("test_unknown_user_rejected", lambda: test_unknown_user_rejected())
+_run_test("test_inactive_user_rejected", lambda: test_inactive_user_rejected())
+_run_test("test_wrong_password_rejected", lambda: test_wrong_password_rejected())
+# 4 failing tests
+_run_test("test_alice_correct_password", lambda: test_alice_correct_password())
+_run_test("test_bob_correct_password", lambda: test_bob_correct_password())
+_run_test("test_diana_correct_password", lambda: test_diana_correct_password())
+_run_test("test_validate_password_direct", lambda: test_validate_password_direct())
+for f in _failures:
+    print(f)
+print(f"{_tests_passed} passed, {_tests_total - _tests_passed} failed")
+'''  # runner script: redefines the two fixture helpers from TEST_SUITE, calls the ten tests through _run_test, prints each failure, then a "<n> passed, <m> failed" line. It defines no tests itself — presumably run after the code under test and TEST_SUITE are loaded; TODO confirm
+GROUND_TRUTH = {  # reference answer for this task, stored as TASK_CONFIG["ground_truth"]
+    "bug_location": "hash_password",  # the defect is in hash_password, not in authenticate_user where the failures surface
+    "bug_type": "bytes_str_conversion",  # category label for the defect
+    "hypothesis_keywords": ["hash_password", "bytes", "str(", "hexdigest", "encoding", "b'"],  # phrases expected in a correct hypothesis — presumably matched by a grader elsewhere; TODO confirm
+    "keyword_match_mode": "hash_password_plus_one",  # must mention "hash_password" AND at least 1 other
+    "red_herring_keyword": "authenticate_user",  # hypothesis mentioning ONLY this scores 0.0
+    "fixed_code": '''import hashlib
+def hash_password(password: str) -> str:
+    """Hash a password using MD5 and return the hex digest string."""
+    password_bytes = password.encode('utf-8')
+    hash_obj = hashlib.md5(password_bytes)
+    return hash_obj.hexdigest()
+def validate_password(password: str, stored_hash: str) -> bool:
+    """Check if password matches the stored hash."""
+    computed_hash = hash_password(password)
+    return computed_hash == stored_hash
+def authenticate_user(username: str, password: str, user_db: dict) -> bool:
+    """Authenticate a user against the database."""
+    if username not in user_db:
+        return False
+    user = user_db[username]
+    if not user.get('active', False):
+        return False
+    return validate_password(password, user['password_hash'])
+''',  # corrected source: hash_password returns hexdigest() directly; the other two functions keep their logic (authenticate_user's docstring is shortened)
+}
+TASK_CONFIG = {  # bundles this module's constants and limits into one dict for task "medium"
+    "task_id": "medium",
+    "task_description": TASK_DESCRIPTION,
+    "buggy_code": BUGGY_CODE,
+    "test_suite": TEST_SUITE,
+    "test_suite_executable": TEST_SUITE_EXECUTABLE,
+    "ground_truth": GROUND_TRUTH,
+    "max_attempts": 7,  # presumably a cap on fix submissions — TODO confirm against the consumer
+    "max_steps": 15,  # presumably a cap on agent actions per episode — TODO confirm
+    "tests_total": 10,  # same value as _tests_total inside TEST_SUITE_EXECUTABLE
+    "allow_threading": False,  # no test in this task uses threads
+}