| """ |
| Unit tests for L1 graders — no network, no LLM calls. |
| |
| Tests are behavioral: each test asserts what the grader DECIDES, |
| not how it computes the decision internally. |
| """ |
|
|
| import sys |
| from pathlib import Path |
| import pytest |
|
|
| sys.path.insert(0, str(Path(__file__).parent.parent.parent / "backend")) |
|
|
| from unittest.mock import MagicMock, patch |
|
|
| import numpy as np |
|
|
| from grader import ( |
| grade_pii_leakage, |
| grade_token_budget, |
| grade_chain_terminology, |
| decompose_claims, |
| grade_faithfulness_decomposed, |
| FAITHFULNESS_THRESHOLD, |
| TOKEN_BUDGET, |
| ) |
|
|
|
|
| |
|
|
class TestPiiLeakage:
    """Behavioral tests for ``grade_pii_leakage``: verdict, score and detail text."""

    def test_clean_response_passes(self) -> None:
        """A response without PII passes with a score of 1.0."""
        outcome = grade_pii_leakage("Stock check is enabled for this retailer.")
        assert outcome.passed is True
        assert outcome.score == 1.0

    def test_email_address_fails(self) -> None:
        """An e-mail address fails the check and "email" appears in the detail."""
        outcome = grade_pii_leakage("Contact ops@example.com for details.")
        assert outcome.passed is False
        assert "email" in outcome.detail

    def test_ssn_pattern_fails(self) -> None:
        """An SSN-shaped number fails the check and "SSN" appears in the detail."""
        outcome = grade_pii_leakage("Employee SSN: 123-45-6789 is on file.")
        assert outcome.passed is False
        assert "SSN" in outcome.detail

    def test_phone_number_fails(self) -> None:
        """A phone number fails the check with a score of 0.0."""
        outcome = grade_pii_leakage("Call 555-867-5309 to reach the manager.")
        assert outcome.passed is False
        assert outcome.score == 0.0

    def test_multiple_pii_types_all_reported(self) -> None:
        """Every PII type found in the response is named in the detail."""
        outcome = grade_pii_leakage("Email ops@test.com or call 555-123-4567.")
        assert outcome.passed is False
        for label in ("email", "phone"):
            assert label in outcome.detail

    def test_score_is_binary(self) -> None:
        """The score is exactly 1.0 for clean text and exactly 0.0 otherwise."""
        clean_outcome = grade_pii_leakage("No PII here.")
        dirty_outcome = grade_pii_leakage("Email: a@b.com")
        assert clean_outcome.score == 1.0
        assert dirty_outcome.score == 0.0
|
|
|
|
| |
|
|
class TestTokenBudget:
    """Behavioral tests for ``grade_token_budget``."""

    def test_short_response_passes(self) -> None:
        """A short response passes with a score of 1.0."""
        graded = grade_token_budget("Short answer.")
        assert graded.passed is True
        assert graded.score == 1.0

    def test_response_at_exact_budget_passes(self) -> None:
        """A response of exactly ``TOKEN_BUDGET * 4`` characters still passes."""
        # NOTE(review): the factor of 4 presumably mirrors the grader's
        # characters-per-token estimate — confirm against grader.py.
        char_count = TOKEN_BUDGET * 4
        graded = grade_token_budget("a" * char_count)
        assert graded.passed is True

    def test_response_over_budget_fails(self) -> None:
        """Four characters past ``TOKEN_BUDGET * 4`` fails with a score below 1.0."""
        char_count = TOKEN_BUDGET * 4 + 4
        graded = grade_token_budget("a" * char_count)
        assert graded.passed is False
        assert graded.score < 1.0

    def test_score_degrades_with_length(self) -> None:
        """A far longer response scores lower than a moderately long one."""
        moderate_len = TOKEN_BUDGET * 5
        extreme_len = TOKEN_BUDGET * 20
        moderate = grade_token_budget("a" * moderate_len)
        extreme = grade_token_budget("a" * extreme_len)
        assert moderate.score > extreme.score

    def test_detail_reports_token_estimate(self) -> None:
        """The detail text mentions "tokens"."""
        graded = grade_token_budget("hello world")
        assert "tokens" in graded.detail

    def test_custom_budget_respected(self) -> None:
        """The ``budget`` keyword overrides the default limit."""
        forty_chars = "a" * 40
        within_budget = grade_token_budget(forty_chars, budget=100)
        assert within_budget.passed is True
        over_budget = grade_token_budget(forty_chars, budget=5)
        assert over_budget.passed is False
|
|
|
|
| |
|
|
class TestChainTerminology:
    """Behavioral tests for ``grade_chain_terminology`` across client vocabularies."""

    def test_correct_client_term_passes(self) -> None:
        """A response using "availability scan" passes for client "novamart"."""
        response = "Run an availability scan to check inventory levels."
        graded = grade_chain_terminology(response, client="novamart")
        assert graded.passed is True

    def test_rival_term_without_correct_term_fails(self) -> None:
        """Using only "stock check" fails and reports the expected term."""
        response = "Run a stock check to see inventory levels."
        graded = grade_chain_terminology(response, client="novamart")
        assert graded.passed is False
        violations = graded.metadata["violations"]
        assert any(entry["expected"] == "availability scan" for entry in violations)

    def test_both_terms_present_does_not_flag(self) -> None:
        """A response containing both terms passes for client "novamart"."""
        response = "Run an availability scan (also called stock check) to check inventory."
        graded = grade_chain_terminology(response, client="novamart")
        assert graded.passed is True

    def test_score_reflects_violation_ratio(self) -> None:
        """A response with violations scores in the range [0.0, 1.0)."""
        response = "Run a stock check and use a feature toggle."
        graded = grade_chain_terminology(response, client="novamart")
        assert 0.0 <= graded.score < 1.0

    def test_clean_response_full_score(self) -> None:
        """A response with no retail terminology scores 1.0."""
        response = "This response uses no retail terminology at all."
        graded = grade_chain_terminology(response, client="novamart")
        assert graded.score == 1.0

    def test_pharma_client_rival_term_fails(self) -> None:
        """For client "pharmalink", "prior authorization" fails and the expected term is reported."""
        response = "Submit a prior authorization request to get the drug approved."
        graded = grade_chain_terminology(response, client="pharmalink")
        assert graded.passed is False
        violations = graded.metadata["violations"]
        assert any(entry["expected"] == "formulary pre-approval" for entry in violations)
|
|
|
|
| |
|
|
class TestDecomposeClaims:
    """Behavioral tests for ``decompose_claims`` (response text -> list of claims)."""

    def test_single_sentence(self) -> None:
        """A single sentence comes back as a one-element list, unchanged."""
        claims = decompose_claims("The product is in stock.")
        assert claims == ["The product is in stock."]

    def test_multi_sentence_split(self) -> None:
        """Three period-terminated sentences yield three claims."""
        claims = decompose_claims(
            "The product is in stock. It costs five dollars. Delivery takes two days."
        )
        assert len(claims) == 3

    def test_fragments_under_three_words_excluded(self) -> None:
        """Fragments shorter than three words ("Yes.") are dropped from the result."""
        claims = decompose_claims("Yes. The product is available in all sizes.")
        # all() is True for an empty iterable, so without this guard the test
        # would pass even if decompose_claims dropped every sentence.
        assert claims, "expected the full sentence to survive as a claim"
        assert all(len(c.split()) >= 3 for c in claims)

    def test_exclamation_and_question_split(self) -> None:
        """Sentences ending in "!" and "?" are split just like those ending in "."."""
        claims = decompose_claims(
            "Stock is low! Would you like to reorder? The threshold is five units."
        )
        assert len(claims) == 3

    def test_empty_string_returns_empty(self) -> None:
        """An empty response produces no claims."""
        assert decompose_claims("") == []
|
|
|
|
| |
|
|
| def _make_nli(entailment: float) -> MagicMock: |
| """Mock CrossEncoder whose predict() always returns the given entailment score.""" |
| mock = MagicMock() |
| |
| mock.predict = MagicMock( |
| side_effect=lambda pairs, **kw: np.array([[0.1, entailment, 0.0]] * len(pairs)) |
| ) |
| return mock |
|
|
|
|
# Shared context passed to grade_faithfulness_decomposed in the tests below:
# two one-sentence paragraphs separated by a blank line.
CONTEXT = "\n\n".join(
    ("The product costs five dollars.", "Delivery takes two days.")
)
|
|
|
|
class TestGradeFaithfulnessDecomposed:
    """Behavioral tests for ``grade_faithfulness_decomposed``.

    Tests that expect claims to be scored patch ``grader.get_nli_model`` with
    a mock, so those tests never load a real model.
    """

    def test_all_claims_supported_passes(self) -> None:
        """High entailment for every claim passes with a score of 1.0."""
        with patch("grader.get_nli_model", return_value=_make_nli(0.9)):
            result = grade_faithfulness_decomposed(
                "The product costs five dollars. Delivery takes two days.", CONTEXT
            )
        assert result.passed is True
        assert result.score == 1.0
        assert result.metadata["claims"][0]["supported"] is True

    def test_all_claims_unsupported_fails(self) -> None:
        """Low entailment for every claim fails with a score of 0.0."""
        with patch("grader.get_nli_model", return_value=_make_nli(0.1)):
            result = grade_faithfulness_decomposed(
                "The product costs five dollars. Delivery takes two days.", CONTEXT
            )
        assert result.passed is False
        assert result.score == 0.0

    def test_partial_hallucination_detected(self) -> None:
        """One grounded claim plus one invented claim gives a score of 0.5."""

        def score_pairs(pairs: list, **kw: object) -> np.ndarray:
            # Key the entailment on the text being scored rather than on how
            # many times predict() has been called, so the outcome does not
            # depend on whether the grader scores claims one call at a time
            # or in a single batch. Only the invented claim mentions "1842".
            return np.array(
                [
                    [0.1, 0.1 if any("1842" in text for text in pair) else 0.9, 0.0]
                    for pair in pairs
                ]
            )

        mock_model = MagicMock()
        mock_model.predict = MagicMock(side_effect=score_pairs)
        with patch("grader.get_nli_model", return_value=mock_model):
            result = grade_faithfulness_decomposed(
                "The product costs five dollars. It was invented in 1842.", CONTEXT
            )
        assert result.score == 0.5
        assert result.metadata["claims"][0]["supported"] is True
        assert result.metadata["claims"][1]["supported"] is False

    def test_refusal_sentinel_auto_passes(self) -> None:
        """A response that is only the "NOT IN DOCUMENTS" refusal passes with 1.0."""
        result = grade_faithfulness_decomposed(
            "NOT IN DOCUMENTS: The context does not contain information about this drug.", CONTEXT
        )
        assert result.passed is True
        assert result.score == 1.0

    def test_refusal_fallback_auto_passes(self) -> None:
        """A plain-language refusal without the sentinel also passes with 1.0."""
        result = grade_faithfulness_decomposed(
            "I don't have enough information to answer that.", CONTEXT
        )
        assert result.passed is True
        assert result.score == 1.0

    def test_sentinel_plus_hallucination_not_auto_passed(self) -> None:
        """The sentinel followed by unsupported content does not get the auto-pass."""
        with patch("grader.get_nli_model", return_value=_make_nli(0.1)):
            result = grade_faithfulness_decomposed(
                "NOT IN DOCUMENTS: X is not in the KB.\nHowever, it likely causes nausea and headaches.",
                CONTEXT,
            )
        assert result.passed is False

    def test_empty_context_fails(self) -> None:
        """With an empty context the response fails with a score of 0.0."""
        with patch("grader.get_nli_model"):
            result = grade_faithfulness_decomposed("The product costs five dollars.", "")
        assert result.passed is False
        assert result.score == 0.0

    def test_metadata_shape(self) -> None:
        """Each per-claim metadata entry carries "claim", "score" and "supported"."""
        with patch("grader.get_nli_model", return_value=_make_nli(0.8)):
            result = grade_faithfulness_decomposed(
                "The product is available. It ships in two days.", CONTEXT
            )
        claim_entries = result.metadata["claims"]
        # Guard against a vacuous pass: the loop below checks nothing if the
        # grader reports no claims at all.
        assert claim_entries, "expected at least one claim entry in metadata"
        for entry in claim_entries:
            assert "claim" in entry
            assert "score" in entry
            assert "supported" in entry

    def test_score_is_proportion_not_max(self) -> None:
        """Verify score = supported/total, not max(entailment_scores)."""
        # The mock reports 0.9 entailment for every pair, so a max-based score
        # would be 0.9; only a supported/total proportion yields exactly 1.0.
        with patch("grader.get_nli_model", return_value=_make_nli(0.9)):
            result = grade_faithfulness_decomposed(
                "Claim one is true. Claim two is also true. Claim three too.", CONTEXT
            )
        assert result.score == 1.0
|
|