# tests/test_integration.py
"""
MediGuard AI — Integration Tests
End-to-end tests verifying the complete analysis workflow.
These tests ensure all components work together correctly.
Run with: pytest tests/test_integration.py -v
"""
import os
from typing import Any
import pytest
# Set deterministic mode for evaluation tests
os.environ["EVALUATION_DETERMINISTIC"] = "true"
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def sample_biomarkers() -> dict[str, float]:
"""Standard diabetic biomarker panel."""
return {
"Glucose": 145,
"HbA1c": 7.2,
"Cholesterol": 220,
"LDL": 140,
"HDL": 45,
"Triglycerides": 180,
}
@pytest.fixture
def normal_biomarkers() -> dict[str, float]:
"""Normal healthy biomarkers."""
return {
"Glucose": 90,
"HbA1c": 5.2,
"Cholesterol": 180,
"LDL": 90,
"HDL": 55,
"Triglycerides": 120,
}
# ---------------------------------------------------------------------------
# Shared Utilities Tests
# ---------------------------------------------------------------------------
class TestBiomarkerParsing:
"""Tests for biomarker parsing from natural language."""
def test_parse_json_input(self):
"""Should parse valid JSON biomarker input."""
from src.shared_utils import parse_biomarkers
result = parse_biomarkers('{"Glucose": 140, "HbA1c": 7.5}')
assert result["Glucose"] == 140
assert result["HbA1c"] == 7.5
def test_parse_key_value_format(self):
"""Should parse key:value format."""
from src.shared_utils import parse_biomarkers
result = parse_biomarkers("Glucose: 140, HbA1c: 7.5")
assert result["Glucose"] == 140
assert result["HbA1c"] == 7.5
def test_parse_natural_language(self):
"""Should parse natural language with units."""
from src.shared_utils import parse_biomarkers
result = parse_biomarkers("glucose 140 mg/dL and hemoglobin 13.5 g/dL")
assert "Glucose" in result or "glucose" in result
assert 140 in result.values()
def test_normalize_biomarker_aliases(self):
"""Should normalize biomarker aliases to canonical names."""
from src.shared_utils import normalize_biomarker_name
assert normalize_biomarker_name("a1c") == "HbA1c"
assert normalize_biomarker_name("fasting glucose") == "Glucose"
assert normalize_biomarker_name("ldl-c") == "LDL"
def test_empty_input(self):
"""Should return empty dict for empty input."""
from src.shared_utils import parse_biomarkers
assert parse_biomarkers("") == {}
assert parse_biomarkers(" ") == {}
class TestDiseaseScoring:
"""Tests for rule-based disease scoring heuristics."""
def test_diabetes_scoring_diabetic(self, sample_biomarkers):
"""Should detect diabetes with elevated glucose/HbA1c."""
from src.shared_utils import score_disease_diabetes
score, severity = score_disease_diabetes(sample_biomarkers)
assert score > 0.5
assert severity in ["moderate", "high"]
def test_diabetes_scoring_normal(self, normal_biomarkers):
"""Should not flag diabetes with normal biomarkers."""
from src.shared_utils import score_disease_diabetes
        score, _ = score_disease_diabetes(normal_biomarkers)
assert score < 0.3
def test_dyslipidemia_scoring(self, sample_biomarkers):
"""Should detect dyslipidemia with elevated lipids."""
from src.shared_utils import score_disease_dyslipidemia
        score, _ = score_disease_dyslipidemia(sample_biomarkers)
assert score > 0.3
def test_primary_prediction(self, sample_biomarkers):
"""Should return highest-confidence prediction."""
from src.shared_utils import get_primary_prediction
result = get_primary_prediction(sample_biomarkers)
assert "disease" in result
assert "confidence" in result
assert "severity" in result
assert result["confidence"] > 0
class TestBiomarkerFlagging:
"""Tests for biomarker classification and flagging."""
def test_classify_abnormal_biomarker(self):
"""Should classify abnormal biomarkers correctly."""
from src.shared_utils import classify_biomarker
assert classify_biomarker("Glucose", 200) == "high"
assert classify_biomarker("Glucose", 50) == "low"
assert classify_biomarker("Glucose", 90) == "normal"
def test_flag_biomarkers(self, sample_biomarkers):
"""Should flag abnormal biomarkers with details."""
from src.shared_utils import flag_biomarkers
flags = flag_biomarkers(sample_biomarkers)
assert len(flags) == len(sample_biomarkers)
# Check that flagged items have expected fields
for flag in flags:
assert "name" in flag
assert "value" in flag
assert "status" in flag
# ---------------------------------------------------------------------------
# Retrieval Tests
# ---------------------------------------------------------------------------
class TestRetrieverInterface:
"""Tests for the unified retriever interface."""
def test_retrieval_result_dataclass(self):
"""Should create RetrievalResult with correct fields."""
from src.services.retrieval.interface import RetrievalResult
result = RetrievalResult(
doc_id="test-123", content="Test content about diabetes.", score=0.85, metadata={"source": "test.pdf"}
)
assert result.doc_id == "test-123"
assert result.score == 0.85
assert "diabetes" in result.content
@pytest.mark.skipif(
not os.path.exists("data/vector_stores/medical_knowledge.faiss"), reason="FAISS index not available"
)
def test_faiss_retriever_loads(self):
"""Should load FAISS retriever from local index."""
from src.services.retrieval import make_retriever
retriever = make_retriever(backend="faiss")
assert retriever.health()
assert retriever.doc_count() > 0
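
    # Added sketch only: the tests above establish make_retriever, health(),
    # and doc_count(), but not a query method. The `retrieve(query, top_k)`
    # name and signature below are hypothetical; adjust to the real interface.
    @pytest.mark.skipif(
        not os.path.exists("data/vector_stores/medical_knowledge.faiss"), reason="FAISS index not available"
    )
    def test_faiss_retriever_returns_scored_results(self):
        """Sketch with a hypothetical `retrieve` method; not the confirmed API."""
        from src.services.retrieval import make_retriever

        retriever = make_retriever(backend="faiss")
        results = retriever.retrieve("diabetes diagnostic criteria", top_k=3)  # hypothetical API
        assert all(hasattr(r, "doc_id") and hasattr(r, "score") for r in results)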
# ---------------------------------------------------------------------------
# Evaluation Tests
# ---------------------------------------------------------------------------
class TestEvaluationSystem:
"""Tests for the 5D evaluation system."""
@pytest.fixture
def sample_response(self) -> dict[str, Any]:
"""Sample analysis response for evaluation."""
return {
"patient_summary": {
"narrative": "Patient shows elevated blood glucose and HbA1c indicating diabetes.",
"primary_finding": "Type 2 Diabetes",
},
"prediction_explanation": {
"key_drivers": [
{"biomarker": "Glucose", "evidence": "Elevated at 145 mg/dL"},
{"biomarker": "HbA1c", "evidence": "7.2% indicates poor glycemic control"},
],
"pdf_references": [
{"source": "guidelines.pdf", "page": 12},
{"source": "diabetes.pdf", "page": 45},
],
},
"clinical_recommendations": {
"immediate_actions": ["Confirm HbA1c", "Schedule follow-up"],
"lifestyle_changes": ["Dietary modifications", "Regular exercise"],
"monitoring": ["Weekly glucose checks"],
},
"biomarker_flags": [
{"name": "Glucose", "value": 145, "status": "high"},
{"name": "HbA1c", "value": 7.2, "status": "high"},
],
"key_findings": ["Diabetes indicators present"],
}
def test_graded_score_validation(self):
"""Should validate score range 0-1."""
from src.evaluation.evaluators import GradedScore
valid = GradedScore(score=0.75, reasoning="Test")
assert valid.score == 0.75
with pytest.raises(ValueError):
GradedScore(score=1.5, reasoning="Invalid")
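
    # Added sketch: the docstring above says scores are validated to the 0-1
    # range; the upper bound is tested, so the lower bound presumably rejects
    # negatives the same way.
    def test_graded_score_rejects_negative(self):
        """Sketch: scores below 0 should fail the 0-1 range validation."""
        from src.evaluation.evaluators import GradedScore

        with pytest.raises(ValueError):
            GradedScore(score=-0.1, reasoning="Invalid")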
def test_evidence_grounding_programmatic(self, sample_response):
"""Should evaluate evidence grounding programmatically."""
from src.evaluation.evaluators import evaluate_evidence_grounding
result = evaluate_evidence_grounding(sample_response)
assert 0 <= result.score <= 1
assert "Citations" in result.reasoning or "citations" in result.reasoning.lower()
def test_safety_completeness_programmatic(self, sample_response, sample_biomarkers):
"""Should evaluate safety completeness programmatically."""
from src.evaluation.evaluators import evaluate_safety_completeness
# Add required field for safety evaluation
sample_response["confidence_assessment"] = {
"limitations": ["Requires clinical confirmation"],
"confidence_score": 0.75,
}
result = evaluate_safety_completeness(sample_response, sample_biomarkers)
assert 0 <= result.score <= 1
@pytest.mark.skipif(
not os.environ.get("GROQ_API_KEY") and not os.environ.get("GOOGLE_API_KEY"), reason="No LLM API key available"
)
def test_deterministic_clinical_accuracy(self, sample_response):
"""Should evaluate clinical accuracy deterministically."""
from src.evaluation.evaluators import evaluate_clinical_accuracy
# EVALUATION_DETERMINISTIC=true set at top of file
result = evaluate_clinical_accuracy(sample_response, "Test context")
assert 0 <= result.score <= 1
assert "[DETERMINISTIC]" in result.reasoning
    def test_evaluation_result_average(self):
"""Should calculate average score across all dimensions."""
from src.evaluation.evaluators import EvaluationResult, GradedScore
result = EvaluationResult(
clinical_accuracy=GradedScore(score=0.8, reasoning="Good"),
evidence_grounding=GradedScore(score=0.7, reasoning="Good"),
actionability=GradedScore(score=0.9, reasoning="Good"),
clarity=GradedScore(score=0.6, reasoning="OK"),
safety_completeness=GradedScore(score=0.8, reasoning="Good"),
)
avg = result.average_score()
assert 0.7 < avg < 0.8 # (0.8+0.7+0.9+0.6+0.8)/5 = 0.76
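
    # Added sketch: a degenerate companion to the average test above; with
    # identical scores in every dimension the mean must equal that score.
    def test_evaluation_result_average_uniform(self):
        """Sketch: uniform dimension scores should average to themselves."""
        from src.evaluation.evaluators import EvaluationResult, GradedScore

        uniform = GradedScore(score=0.5, reasoning="Uniform")
        result = EvaluationResult(
            clinical_accuracy=uniform,
            evidence_grounding=uniform,
            actionability=uniform,
            clarity=uniform,
            safety_completeness=uniform,
        )
        assert result.average_score() == pytest.approx(0.5)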
# ---------------------------------------------------------------------------
# API Route Tests
# ---------------------------------------------------------------------------
class TestAPIRoutes:
"""Tests for FastAPI routes (requires running server or test client)."""
def test_analyze_router_import(self):
"""Should import analyze router without errors."""
from src.routers import analyze
assert hasattr(analyze, "router")
def test_health_check_import(self):
"""Should have health check endpoint."""
from src.routers import health
assert hasattr(health, "router")
# ---------------------------------------------------------------------------
# HuggingFace App Tests
# ---------------------------------------------------------------------------
class TestHuggingFaceApp:
"""Tests for HuggingFace Gradio app components."""
def test_shared_utils_import_in_hf(self):
"""HuggingFace app should import shared utilities."""
import sys
from pathlib import Path
# Add project root to path (as HF app does)
project_root = str(Path(__file__).parent.parent)
if project_root not in sys.path:
sys.path.insert(0, project_root)
from src.shared_utils import parse_biomarkers
# Should work without errors
result = parse_biomarkers("Glucose: 140")
assert "Glucose" in result or len(result) > 0
# ---------------------------------------------------------------------------
# Workflow Tests
# ---------------------------------------------------------------------------
@pytest.mark.skipif(
not os.environ.get("GROQ_API_KEY") and not os.environ.get("GOOGLE_API_KEY"), reason="No LLM API key available"
)
class TestWorkflow:
"""Tests requiring LLM API access."""
def test_create_guild(self):
"""Should create ClinicalInsightGuild without errors."""
from src.workflow import create_guild
guild = create_guild()
assert guild is not None
if __name__ == "__main__":
pytest.main([__file__, "-v"])