# tests/test_integration.py
"""
MediGuard AI — Integration Tests
End-to-end tests verifying the complete analysis workflow.
These tests ensure all components work together correctly.
Run with: pytest tests/test_integration.py -v
"""
import os
from typing import Any
import pytest
# Set deterministic mode for evaluation tests
os.environ["EVALUATION_DETERMINISTIC"] = "true"
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def sample_biomarkers() -> dict[str, float]:
"""Standard diabetic biomarker panel."""
return {
"Glucose": 145,
"HbA1c": 7.2,
"Cholesterol": 220,
"LDL": 140,
"HDL": 45,
"Triglycerides": 180,
}
@pytest.fixture
def normal_biomarkers() -> dict[str, float]:
"""Normal healthy biomarkers."""
return {
"Glucose": 90,
"HbA1c": 5.2,
"Cholesterol": 180,
"LDL": 90,
"HDL": 55,
"Triglycerides": 120,
}
# ---------------------------------------------------------------------------
# Shared Utilities Tests
# ---------------------------------------------------------------------------
class TestBiomarkerParsing:
"""Tests for biomarker parsing from natural language."""
def test_parse_json_input(self):
"""Should parse valid JSON biomarker input."""
from src.shared_utils import parse_biomarkers
result = parse_biomarkers('{"Glucose": 140, "HbA1c": 7.5}')
assert result["Glucose"] == 140
assert result["HbA1c"] == 7.5
def test_parse_key_value_format(self):
"""Should parse key:value format."""
from src.shared_utils import parse_biomarkers
result = parse_biomarkers("Glucose: 140, HbA1c: 7.5")
assert result["Glucose"] == 140
assert result["HbA1c"] == 7.5
def test_parse_natural_language(self):
"""Should parse natural language with units."""
from src.shared_utils import parse_biomarkers
result = parse_biomarkers("glucose 140 mg/dL and hemoglobin 13.5 g/dL")
assert "Glucose" in result or "glucose" in result
assert 140 in result.values()
def test_normalize_biomarker_aliases(self):
"""Should normalize biomarker aliases to canonical names."""
from src.shared_utils import normalize_biomarker_name
assert normalize_biomarker_name("a1c") == "HbA1c"
assert normalize_biomarker_name("fasting glucose") == "Glucose"
assert normalize_biomarker_name("ldl-c") == "LDL"
def test_empty_input(self):
"""Should return empty dict for empty input."""
from src.shared_utils import parse_biomarkers
assert parse_biomarkers("") == {}
assert parse_biomarkers(" ") == {}
class TestDiseaseScoring:
"""Tests for rule-based disease scoring heuristics."""
def test_diabetes_scoring_diabetic(self, sample_biomarkers):
"""Should detect diabetes with elevated glucose/HbA1c."""
from src.shared_utils import score_disease_diabetes
score, severity = score_disease_diabetes(sample_biomarkers)
assert score > 0.5
assert severity in ["moderate", "high"]
def test_diabetes_scoring_normal(self, normal_biomarkers):
"""Should not flag diabetes with normal biomarkers."""
from src.shared_utils import score_disease_diabetes
        score, _ = score_disease_diabetes(normal_biomarkers)
assert score < 0.3
def test_dyslipidemia_scoring(self, sample_biomarkers):
"""Should detect dyslipidemia with elevated lipids."""
from src.shared_utils import score_disease_dyslipidemia
        score, _ = score_disease_dyslipidemia(sample_biomarkers)
assert score > 0.3
def test_primary_prediction(self, sample_biomarkers):
"""Should return highest-confidence prediction."""
from src.shared_utils import get_primary_prediction
result = get_primary_prediction(sample_biomarkers)
assert "disease" in result
assert "confidence" in result
assert "severity" in result
assert result["confidence"] > 0
class TestBiomarkerFlagging:
"""Tests for biomarker classification and flagging."""
def test_classify_abnormal_biomarker(self):
"""Should classify abnormal biomarkers correctly."""
from src.shared_utils import classify_biomarker
assert classify_biomarker("Glucose", 200) == "high"
assert classify_biomarker("Glucose", 50) == "low"
assert classify_biomarker("Glucose", 90) == "normal"
def test_flag_biomarkers(self, sample_biomarkers):
"""Should flag abnormal biomarkers with details."""
from src.shared_utils import flag_biomarkers
flags = flag_biomarkers(sample_biomarkers)
assert len(flags) == len(sample_biomarkers)
# Check that flagged items have expected fields
for flag in flags:
assert "name" in flag
assert "value" in flag
assert "status" in flag
# ---------------------------------------------------------------------------
# Retrieval Tests
# ---------------------------------------------------------------------------
class TestRetrieverInterface:
"""Tests for the unified retriever interface."""
def test_retrieval_result_dataclass(self):
"""Should create RetrievalResult with correct fields."""
from src.services.retrieval.interface import RetrievalResult
result = RetrievalResult(
doc_id="test-123", content="Test content about diabetes.", score=0.85, metadata={"source": "test.pdf"}
)
assert result.doc_id == "test-123"
assert result.score == 0.85
assert "diabetes" in result.content
@pytest.mark.skipif(
not os.path.exists("data/vector_stores/medical_knowledge.faiss"), reason="FAISS index not available"
)
def test_faiss_retriever_loads(self):
"""Should load FAISS retriever from local index."""
from src.services.retrieval import make_retriever
retriever = make_retriever(backend="faiss")
assert retriever.health()
assert retriever.doc_count() > 0
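
    # Added sketch only: the tests above establish make_retriever, health(),
    # and doc_count(), but not a query method. The `retrieve(query, top_k)`
    # name and signature below are hypothetical; adjust to the real interface.
    @pytest.mark.skipif(
        not os.path.exists("data/vector_stores/medical_knowledge.faiss"), reason="FAISS index not available"
    )
    def test_faiss_retriever_returns_scored_results(self):
        """Sketch with a hypothetical `retrieve` method; not the confirmed API."""
        from src.services.retrieval import make_retriever

        retriever = make_retriever(backend="faiss")
        results = retriever.retrieve("diabetes diagnostic criteria", top_k=3)  # hypothetical API
        assert all(hasattr(r, "doc_id") and hasattr(r, "score") for r in results)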
# ---------------------------------------------------------------------------
# Evaluation Tests
# ---------------------------------------------------------------------------
class TestEvaluationSystem:
"""Tests for the 5D evaluation system."""
@pytest.fixture
def sample_response(self) -> dict[str, Any]:
"""Sample analysis response for evaluation."""
return {
"patient_summary": {
"narrative": "Patient shows elevated blood glucose and HbA1c indicating diabetes.",
"primary_finding": "Type 2 Diabetes",
},
"prediction_explanation": {
"key_drivers": [
{"biomarker": "Glucose", "evidence": "Elevated at 145 mg/dL"},
{"biomarker": "HbA1c", "evidence": "7.2% indicates poor glycemic control"},
],
"pdf_references": [
{"source": "guidelines.pdf", "page": 12},
{"source": "diabetes.pdf", "page": 45},
],
},
"clinical_recommendations": {
"immediate_actions": ["Confirm HbA1c", "Schedule follow-up"],
"lifestyle_changes": ["Dietary modifications", "Regular exercise"],
"monitoring": ["Weekly glucose checks"],
},
"biomarker_flags": [
{"name": "Glucose", "value": 145, "status": "high"},
{"name": "HbA1c", "value": 7.2, "status": "high"},
],
"key_findings": ["Diabetes indicators present"],
}
def test_graded_score_validation(self):
"""Should validate score range 0-1."""
from src.evaluation.evaluators import GradedScore
valid = GradedScore(score=0.75, reasoning="Test")
assert valid.score == 0.75
with pytest.raises(ValueError):
GradedScore(score=1.5, reasoning="Invalid")
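
    # Added sketch: the docstring above says scores are validated to the 0-1
    # range; the upper bound is tested, so the lower bound presumably rejects
    # negatives the same way.
    def test_graded_score_rejects_negative(self):
        """Sketch: scores below 0 should fail the 0-1 range validation."""
        from src.evaluation.evaluators import GradedScore

        with pytest.raises(ValueError):
            GradedScore(score=-0.1, reasoning="Invalid")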
def test_evidence_grounding_programmatic(self, sample_response):
"""Should evaluate evidence grounding programmatically."""
from src.evaluation.evaluators import evaluate_evidence_grounding
result = evaluate_evidence_grounding(sample_response)
assert 0 <= result.score <= 1
assert "Citations" in result.reasoning or "citations" in result.reasoning.lower()
def test_safety_completeness_programmatic(self, sample_response, sample_biomarkers):
"""Should evaluate safety completeness programmatically."""
from src.evaluation.evaluators import evaluate_safety_completeness
# Add required field for safety evaluation
sample_response["confidence_assessment"] = {
"limitations": ["Requires clinical confirmation"],
"confidence_score": 0.75,
}
result = evaluate_safety_completeness(sample_response, sample_biomarkers)
assert 0 <= result.score <= 1
@pytest.mark.skipif(
not os.environ.get("GROQ_API_KEY") and not os.environ.get("GOOGLE_API_KEY"), reason="No LLM API key available"
)
def test_deterministic_clinical_accuracy(self, sample_response):
"""Should evaluate clinical accuracy deterministically."""
from src.evaluation.evaluators import evaluate_clinical_accuracy
# EVALUATION_DETERMINISTIC=true set at top of file
result = evaluate_clinical_accuracy(sample_response, "Test context")
assert 0 <= result.score <= 1
assert "[DETERMINISTIC]" in result.reasoning
    def test_evaluation_result_average(self):
"""Should calculate average score across all dimensions."""
from src.evaluation.evaluators import EvaluationResult, GradedScore
result = EvaluationResult(
clinical_accuracy=GradedScore(score=0.8, reasoning="Good"),
evidence_grounding=GradedScore(score=0.7, reasoning="Good"),
actionability=GradedScore(score=0.9, reasoning="Good"),
clarity=GradedScore(score=0.6, reasoning="OK"),
safety_completeness=GradedScore(score=0.8, reasoning="Good"),
)
avg = result.average_score()
assert 0.7 < avg < 0.8 # (0.8+0.7+0.9+0.6+0.8)/5 = 0.76
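
    # Added sketch: a degenerate companion to the average test above; with
    # identical scores in every dimension the mean must equal that score.
    def test_evaluation_result_average_uniform(self):
        """Sketch: uniform dimension scores should average to themselves."""
        from src.evaluation.evaluators import EvaluationResult, GradedScore

        uniform = GradedScore(score=0.5, reasoning="Uniform")
        result = EvaluationResult(
            clinical_accuracy=uniform,
            evidence_grounding=uniform,
            actionability=uniform,
            clarity=uniform,
            safety_completeness=uniform,
        )
        assert result.average_score() == pytest.approx(0.5)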
# ---------------------------------------------------------------------------
# API Route Tests
# ---------------------------------------------------------------------------
class TestAPIRoutes:
"""Tests for FastAPI routes (requires running server or test client)."""
def test_analyze_router_import(self):
"""Should import analyze router without errors."""
from src.routers import analyze
assert hasattr(analyze, "router")
def test_health_check_import(self):
"""Should have health check endpoint."""
from src.routers import health
assert hasattr(health, "router")
# ---------------------------------------------------------------------------
# HuggingFace App Tests
# ---------------------------------------------------------------------------
class TestHuggingFaceApp:
"""Tests for HuggingFace Gradio app components."""
def test_shared_utils_import_in_hf(self):
"""HuggingFace app should import shared utilities."""
import sys
from pathlib import Path
# Add project root to path (as HF app does)
project_root = str(Path(__file__).parent.parent)
if project_root not in sys.path:
sys.path.insert(0, project_root)
from src.shared_utils import parse_biomarkers
# Should work without errors
result = parse_biomarkers("Glucose: 140")
assert "Glucose" in result or len(result) > 0
# ---------------------------------------------------------------------------
# Workflow Tests
# ---------------------------------------------------------------------------
@pytest.mark.skipif(
not os.environ.get("GROQ_API_KEY") and not os.environ.get("GOOGLE_API_KEY"), reason="No LLM API key available"
)
class TestWorkflow:
"""Tests requiring LLM API access."""
def test_create_guild(self):
"""Should create ClinicalInsightGuild without errors."""
from src.workflow import create_guild
guild = create_guild()
assert guild is not None
if __name__ == "__main__":
pytest.main([__file__, "-v"])