Spaces:

below-threshold
/

ai-response-validator

Running

ai-response-validator / tests / unit / test_grader.py

mbochniak01

Fix sentinel edge cases: hallucination combo guard + UI formatting

8d335e4 1 day ago

11.1 kB

	"""
	Unit tests for L1 graders — no network, no LLM calls.

	Tests are behavioral: each test asserts what the grader DECIDES,
	not how it computes the decision internally.
	"""

	import sys
	from pathlib import Path
	import pytest

	sys.path.insert(0, str(Path(__file__).parent.parent.parent / "backend"))

	from unittest.mock import MagicMock, patch

	import numpy as np

	from grader import (
	grade_pii_leakage,
	grade_token_budget,
	grade_chain_terminology,
	decompose_claims,
	grade_faithfulness_decomposed,
	FAITHFULNESS_THRESHOLD,
	TOKEN_BUDGET,
	)


	# ── pii_leakage ──────────────────────────────────────────────────────────────

	class TestPiiLeakage:
	    """Behavioral checks for ``grade_pii_leakage``."""

	    def test_clean_response_passes(self) -> None:
	        """A response with no PII is expected to pass with score 1.0."""
	        verdict = grade_pii_leakage("Stock check is enabled for this retailer.")
	        assert verdict.passed is True
	        assert verdict.score == 1.0

	    def test_email_address_fails(self) -> None:
	        """An email address is expected to fail and be named in the detail."""
	        verdict = grade_pii_leakage("Contact ops@example.com for details.")
	        assert verdict.passed is False
	        assert "email" in verdict.detail

	    def test_ssn_pattern_fails(self) -> None:
	        """An SSN-shaped number is expected to fail and be named in the detail."""
	        verdict = grade_pii_leakage("Employee SSN: 123-45-6789 is on file.")
	        assert verdict.passed is False
	        assert "SSN" in verdict.detail

	    def test_phone_number_fails(self) -> None:
	        """A phone number is expected to fail with score 0.0."""
	        verdict = grade_pii_leakage("Call 555-867-5309 to reach the manager.")
	        assert verdict.passed is False
	        assert verdict.score == 0.0

	    def test_multiple_pii_types_all_reported(self) -> None:
	        """Every PII type present should be mentioned in the detail text."""
	        verdict = grade_pii_leakage("Email ops@test.com or call 555-123-4567.")
	        assert verdict.passed is False
	        for pii_label in ("email", "phone"):
	            assert pii_label in verdict.detail

	    def test_score_is_binary(self) -> None:
	        """The score is expected to be 1.0 for clean text and 0.0 otherwise."""
	        clean_score = grade_pii_leakage("No PII here.").score
	        dirty_score = grade_pii_leakage("Email: a@b.com").score
	        assert clean_score == 1.0
	        assert dirty_score == 0.0


	# ── token_budget ──────────────────────────────────────────────────────────────

	class TestTokenBudget:
	    """Behavioral checks for ``grade_token_budget``."""

	    def test_short_response_passes(self) -> None:
	        """A very short response is expected to pass with score 1.0."""
	        verdict = grade_token_budget("Short answer.")
	        assert verdict.passed is True
	        assert verdict.score == 1.0

	    def test_response_at_exact_budget_passes(self) -> None:
	        """A response of exactly TOKEN_BUDGET * 4 characters should still pass."""
	        at_limit = "a" * (TOKEN_BUDGET * 4)
	        verdict = grade_token_budget(at_limit)
	        assert verdict.passed is True

	    def test_response_over_budget_fails(self) -> None:
	        """Four characters past the limit should fail with a reduced score."""
	        over_limit = "a" * (TOKEN_BUDGET * 4 + 4)
	        verdict = grade_token_budget(over_limit)
	        assert verdict.passed is False
	        assert verdict.score < 1.0

	    def test_score_degrades_with_length(self) -> None:
	        """A far longer response should score below a moderately long one."""
	        moderate_score = grade_token_budget("a" * (TOKEN_BUDGET * 5)).score
	        extreme_score = grade_token_budget("a" * (TOKEN_BUDGET * 20)).score
	        assert extreme_score < moderate_score

	    def test_detail_reports_token_estimate(self) -> None:
	        """The detail text is expected to mention tokens."""
	        verdict = grade_token_budget("hello world")
	        assert "tokens" in verdict.detail

	    def test_custom_budget_respected(self) -> None:
	        """An explicit ``budget`` argument should drive the pass/fail outcome."""
	        sample = "a" * 40  # ~10 tokens
	        roomy = grade_token_budget(sample, budget=100)
	        assert roomy.passed is True
	        tight = grade_token_budget(sample, budget=5)
	        assert tight.passed is False


	# ── chain_terminology ─────────────────────────────────────────────────────────

	class TestChainTerminology:
	    """Behavioral checks for ``grade_chain_terminology``."""

	    def test_correct_client_term_passes(self) -> None:
	        """Using the client's own term is expected to pass."""
	        response = "Run an availability scan to check inventory levels."
	        verdict = grade_chain_terminology(response, client="novamart")
	        assert verdict.passed is True

	    def test_rival_term_without_correct_term_fails(self) -> None:
	        """A rival's term on its own should fail and name the expected term."""
	        # "stock check" is ShelfWise term for STOCK_CHECK — wrong for NovaMart
	        response = "Run a stock check to see inventory levels."
	        verdict = grade_chain_terminology(response, client="novamart")
	        assert verdict.passed is False
	        violations = verdict.metadata["violations"]
	        assert any(item["expected"] == "availability scan" for item in violations)

	    def test_both_terms_present_does_not_flag(self) -> None:
	        """Mentioning both terms together is not treated as a violation."""
	        # Response explains both — not a violation
	        response = "Run an availability scan (also called stock check) to check inventory."
	        verdict = grade_chain_terminology(response, client="novamart")
	        assert verdict.passed is True

	    def test_score_reflects_violation_ratio(self) -> None:
	        """A response with violations should score in [0.0, 1.0)."""
	        response = "Run a stock check and use a feature toggle."
	        verdict = grade_chain_terminology(response, client="novamart")
	        assert 0.0 <= verdict.score < 1.0

	    def test_clean_response_full_score(self) -> None:
	        """A response with no retail terminology should score 1.0."""
	        response = "This response uses no retail terminology at all."
	        verdict = grade_chain_terminology(response, client="novamart")
	        assert verdict.score == 1.0

	    def test_pharma_client_rival_term_fails(self) -> None:
	        """A rival's pharma term should fail and name the expected term."""
	        # "prior authorization" is ClinixOne term — wrong for PharmaLink
	        response = "Submit a prior authorization request to get the drug approved."
	        verdict = grade_chain_terminology(response, client="pharmalink")
	        assert verdict.passed is False
	        violations = verdict.metadata["violations"]
	        assert any(item["expected"] == "formulary pre-approval" for item in violations)


	# ── decompose_claims ──────────────────────────────────────────────────────────

	class TestDecomposeClaims:
	    """Behavioral checks for ``decompose_claims``."""

	    def test_single_sentence(self) -> None:
	        """A single sentence should come back as one unchanged claim."""
	        sentence = "The product is in stock."
	        assert decompose_claims(sentence) == [sentence]

	    def test_multi_sentence_split(self) -> None:
	        """Three period-terminated sentences should yield three claims."""
	        pieces = decompose_claims(
	            "The product is in stock. It costs five dollars. Delivery takes two days."
	        )
	        assert len(pieces) == 3

	    def test_fragments_under_three_words_excluded(self) -> None:
	        """Every returned claim should have at least three words."""
	        pieces = decompose_claims("Yes. The product is available in all sizes.")
	        for piece in pieces:
	            assert len(piece.split()) >= 3

	    def test_exclamation_and_question_split(self) -> None:
	        """Sentences ending in '!' and '?' should also be split."""
	        pieces = decompose_claims(
	            "Stock is low! Would you like to reorder? The threshold is five units."
	        )
	        assert len(pieces) == 3

	    def test_empty_string_returns_empty(self) -> None:
	        """An empty string should produce an empty list."""
	        pieces = decompose_claims("")
	        assert pieces == []


	# ── grade_faithfulness_decomposed ────────────────────────────────────────────

	def _make_nli(entailment: float) -> MagicMock:
	    """Build a mock CrossEncoder whose ``predict()`` returns a fixed entailment score.

	    Args:
	        entailment: Value placed in the entailment column of every returned row.

	    Returns:
	        A ``MagicMock`` whose ``predict(pairs, **kwargs)`` returns a NumPy array
	        with one ``[contradiction, entailment, neutral]`` row per input pair.
	    """

	    def _fixed_scores(pairs: list, **kw: object) -> np.ndarray:
	        # Keyword arguments are accepted and ignored so the mock tolerates
	        # whatever options the caller passes to predict().
	        # columns: [contradiction, entailment, neutral]
	        return np.array([[0.1, entailment, 0.0]] * len(pairs))

	    mock = MagicMock()
	    mock.predict = MagicMock(side_effect=_fixed_scores)
	    return mock


	# Shared reference context for the faithfulness tests below: two one-sentence
	# paragraphs separated by a blank line.
	CONTEXT = "The product costs five dollars.\n\nDelivery takes two days."


	class TestGradeFaithfulnessDecomposed:
	    """Behavioral checks for ``grade_faithfulness_decomposed`` with a mocked NLI model."""

	    def test_all_claims_supported_passes(self) -> None:
	        """High entailment for every claim should pass with score 1.0."""
	        response = "The product costs five dollars. Delivery takes two days."
	        with patch("grader.get_nli_model", return_value=_make_nli(0.9)):
	            graded = grade_faithfulness_decomposed(response, CONTEXT)
	        assert graded.passed is True
	        assert graded.score == 1.0
	        assert graded.metadata["claims"][0]["supported"] is True

	    def test_all_claims_unsupported_fails(self) -> None:
	        """Low entailment for every claim should fail with score 0.0."""
	        response = "The product costs five dollars. Delivery takes two days."
	        with patch("grader.get_nli_model", return_value=_make_nli(0.1)):
	            graded = grade_faithfulness_decomposed(response, CONTEXT)
	        assert graded.passed is False
	        assert graded.score == 0.0

	    def test_partial_hallucination_detected(self) -> None:
	        """One supported and one unsupported claim should score 0.5."""
	        # first claim supported, second not — whole-response NLI would miss this
	        first_call_only = iter([0.9])

	        def scripted_predict(pairs: list, **kw: object) -> np.ndarray:
	            # The first predict() call gets 0.9; every later call falls back to 0.1.
	            entailment = next(first_call_only, 0.1)
	            return np.array([[0.1, entailment, 0.0]] * len(pairs))

	        nli = MagicMock()
	        nli.predict = MagicMock(side_effect=scripted_predict)
	        with patch("grader.get_nli_model", return_value=nli):
	            graded = grade_faithfulness_decomposed(
	                "The product costs five dollars. It was invented in 1842.", CONTEXT
	            )
	        assert graded.score == 0.5
	        per_claim = graded.metadata["claims"]
	        assert per_claim[0]["supported"] is True
	        assert per_claim[1]["supported"] is False

	    def test_refusal_sentinel_auto_passes(self) -> None:
	        """A response that is only the refusal sentinel should pass with score 1.0."""
	        response = "NOT IN DOCUMENTS: The context does not contain information about this drug."
	        graded = grade_faithfulness_decomposed(response, CONTEXT)
	        assert graded.passed is True
	        assert graded.score == 1.0

	    def test_refusal_fallback_auto_passes(self) -> None:
	        """A plain-language refusal should pass with score 1.0."""
	        response = "I don't have enough information to answer that."
	        graded = grade_faithfulness_decomposed(response, CONTEXT)
	        assert graded.passed is True
	        assert graded.score == 1.0

	    def test_sentinel_plus_hallucination_not_auto_passed(self) -> None:
	        """A sentinel followed by extra claims must not be auto-passed."""
	        # Sentinel on first line but additional claims follow — must be NLI-scored.
	        response = (
	            "NOT IN DOCUMENTS: X is not in the KB.\n"
	            "However, it likely causes nausea and headaches."
	        )
	        with patch("grader.get_nli_model", return_value=_make_nli(0.1)):
	            graded = grade_faithfulness_decomposed(response, CONTEXT)
	        assert graded.passed is False

	    def test_empty_context_fails(self) -> None:
	        """An empty context should fail with score 0.0."""
	        with patch("grader.get_nli_model"):
	            graded = grade_faithfulness_decomposed("The product costs five dollars.", "")
	        assert graded.passed is False
	        assert graded.score == 0.0

	    def test_metadata_shape(self) -> None:
	        """Each per-claim metadata entry should carry claim, score and supported."""
	        response = "The product is available. It ships in two days."
	        with patch("grader.get_nli_model", return_value=_make_nli(0.8)):
	            graded = grade_faithfulness_decomposed(response, CONTEXT)
	        for record in graded.metadata["claims"]:
	            assert "claim" in record
	            assert "score" in record
	            assert "supported" in record

	    def test_score_is_proportion_not_max(self) -> None:
	        """Verify score = supported/total, not max(entailment_scores)."""
	        response = "Claim one is true. Claim two is also true. Claim three too."
	        with patch("grader.get_nli_model", return_value=_make_nli(0.9)):
	            graded = grade_faithfulness_decomposed(response, CONTEXT)
	        assert graded.score == 1.0