Spaces:

T0X1N
/

Agentic-RagBot

Sleeping

App Files Files Community

Agentic-RagBot / tests /test_evaluation_system.py

T0X1N

chore: codebase audit and fixes (ruff, mypy, pytest)

9659593 1 day ago

raw

history blame contribute delete

7.7 kB

	"""
	Test the 5D Evaluation System
	Tests all evaluators with real diabetes patient output
	"""

	import sys
	from pathlib import Path

	sys.path.insert(0, str(Path(__file__).parent.parent))

	import json

	import pytest
	import os

	from src.evaluation.evaluators import run_full_evaluation
	from src.state import AgentOutput


	@pytest.mark.skipif(
	    not os.environ.get("GROQ_API_KEY") and not os.environ.get("GOOGLE_API_KEY"), reason="No LLM API key available"
	)
	def test_evaluation_system():
	    """Run the 5D evaluation pipeline on a saved diabetes report and validate its scores.

	    The test is skipped by pytest when neither ``GROQ_API_KEY`` nor
	    ``GOOGLE_API_KEY`` is set in the environment.

	    Steps:
	        1. Load ``test_output_diabetes.json`` from the directory of this file.
	        2. Build the hard-coded biomarker dict and five mock ``AgentOutput`` objects.
	        3. Call ``run_full_evaluation`` with the report, agent outputs and biomarkers.
	        4. Print each dimension's score and reasoning plus summary statistics.
	        5. Assert every score lies in ``[0.0, 1.0]`` and the average is positive.

	    Raises:
	        Exception: whatever ``run_full_evaluation`` raises, re-raised unchanged
	            after a diagnostic banner and traceback are printed.
	        AssertionError: if a score is outside ``[0.0, 1.0]`` or the average
	            score is not greater than zero.
	    """

	    print("=" * 80)
	    print("TESTING 5D EVALUATION SYSTEM")
	    print("=" * 80)

	    # Load test output from diabetes patient
	    test_output_path = Path(__file__).parent / "test_output_diabetes.json"
	    with open(test_output_path, encoding="utf-8") as f:
	        final_response = json.load(f)

	    print(f"\n✓ Loaded test data from: {test_output_path}")
	    print(f" - Disease: {final_response['prediction_explanation']['primary_disease']}")
	    print(f" - Confidence: {final_response['prediction_explanation']['confidence']:.1%}")
	    print(f" - Out of range biomarkers: {final_response['patient_summary']['biomarkers_out_of_range']}")
	    print(f" - Critical alerts: {len(final_response['safety_alerts'])}")

	    # Reconstruct patient biomarkers from test output
	    # (values are hard-coded here rather than read back from the JSON file)
	    biomarkers = {
	        "Glucose": 185.0,
	        "HbA1c": 8.2,
	        "Cholesterol": 235.0,
	        "Triglycerides": 210.0,
	        "HDL": 38.0,
	        "LDL": 155.0,
	        "VLDL": 42.0,
	        "Total_Protein": 6.8,
	        "Albumin": 4.2,
	        "Globulin": 2.6,
	        "AG_Ratio": 1.6,
	        "Bilirubin_Total": 0.9,
	        "Bilirubin_Direct": 0.2,
	        "ALT": 35.0,
	        "AST": 28.0,
	        "ALP": 95.0,
	        "Creatinine": 1.1,
	        "BUN": 18.0,
	        "BUN_Creatinine_Ratio": 16.4,
	        "Uric_Acid": 6.2,
	        "WBC": 7200.0,
	        "RBC": 4.7,
	        "Hemoglobin": 14.2,
	        "Hematocrit": 42.0,
	        "Platelets": 245.0,
	    }

	    print(f"\n✓ Reconstructed {len(biomarkers)} biomarker values")

	    # Mock agent outputs to provide PubMed context for Clinical Accuracy evaluator
	    disease_explainer_context = """
	Type 2 diabetes (T2D) accounts for the majority of cases and results
	primarily from insulin resistance with a progressive beta-cell secretory defect.

	Pathophysiology:
	- Insulin resistance in peripheral tissues (muscle, liver, adipose)
	- Progressive decline in beta-cell function
	- Impaired glucose homeostasis leading to hyperglycemia
	- Long-term complications affecting cardiovascular, renal, and neurological systems

	Key Biomarkers:
	- Fasting glucose ≥126 mg/dL indicates diabetes
	- HbA1c ≥6.5% indicates diabetes
	- Elevated cholesterol and triglycerides common due to dyslipidemia
	- HDL typically reduced in metabolic syndrome

	Clinical Management:
	- Lifestyle modifications (diet, exercise)
	- Pharmacological intervention (metformin, insulin sensitizers)
	- Regular monitoring of glycemic control
	- Cardiovascular risk management
	"""

	    agent_outputs = [
	        AgentOutput(
	            agent_name="Disease Explainer",
	            findings=disease_explainer_context,
	            metadata={"citations": ["diabetes.pdf", "MediGuard_Diabetes_Guidelines_Extensive.pdf"]},
	        ),
	        AgentOutput(
	            agent_name="Biomarker Analyzer",
	            findings="Analyzed 25 biomarkers. Found 19 out of range, 3 critical values.",
	            metadata={"citations": []},
	        ),
	        AgentOutput(
	            agent_name="Biomarker-Disease Linker",
	            findings="Glucose and HbA1c are primary drivers for Type 2 Diabetes prediction.",
	            metadata={"citations": ["diabetes.pdf"]},
	        ),
	        AgentOutput(
	            agent_name="Clinical Guidelines",
	            findings="Recommend immediate medical consultation, lifestyle modifications.",
	            metadata={"citations": ["diabetes.pdf"]},
	        ),
	        AgentOutput(
	            agent_name="Confidence Assessor",
	            findings="High confidence prediction (87%) based on strong biomarker evidence.",
	            metadata={"citations": []},
	        ),
	    ]

	    print(f"✓ Created {len(agent_outputs)} mock agent outputs for evaluation context")

	    # Run full evaluation
	    print("\n" + "=" * 80)
	    print("RUNNING EVALUATION PIPELINE")
	    print("=" * 80)

	    # Only the pipeline call is guarded: a failure here is a pipeline error,
	    # whereas a failed assertion further down must not be reported under the
	    # "Evaluation failed" banner.
	    try:
	        evaluation_result = run_full_evaluation(
	            final_response=final_response, agent_outputs=agent_outputs, biomarkers=biomarkers
	        )
	    except Exception as e:
	        print("\n" + "=" * 80)
	        print("Evaluation failed")
	        print("=" * 80)
	        print(f"\nError: {type(e).__name__}: {e!s}")
	        import traceback

	        traceback.print_exc()
	        raise

	    # Display results
	    print("\n" + "=" * 80)
	    print("5D EVALUATION RESULTS")
	    print("=" * 80)

	    # Clinical Accuracy and Actionability reasoning is truncated to 200 characters.
	    print(f"\n1. 📊 Clinical Accuracy: {evaluation_result.clinical_accuracy.score:.3f}")
	    print(f" Reasoning: {evaluation_result.clinical_accuracy.reasoning[:200]}...")

	    print(f"\n2. 📚 Evidence Grounding: {evaluation_result.evidence_grounding.score:.3f}")
	    print(f" Reasoning: {evaluation_result.evidence_grounding.reasoning}")

	    print(f"\n3. ⚡ Actionability: {evaluation_result.actionability.score:.3f}")
	    print(f" Reasoning: {evaluation_result.actionability.reasoning[:200]}...")

	    print(f"\n4. 💡 Clarity: {evaluation_result.clarity.score:.3f}")
	    print(f" Reasoning: {evaluation_result.clarity.reasoning}")

	    print(f"\n5. 🛡️ Safety & Completeness: {evaluation_result.safety_completeness.score:.3f}")
	    print(f" Reasoning: {evaluation_result.safety_completeness.reasoning}")

	    # Summary
	    print("\n" + "=" * 80)
	    print("SUMMARY")
	    print("=" * 80)

	    scores = evaluation_result.to_vector()
	    avg_score = sum(scores) / len(scores)

	    print(f"\n✓ Evaluation Vector: {[f'{s:.3f}' for s in scores]}")
	    print(f"✓ Average Score: {avg_score:.3f}")
	    print(f"✓ Min Score: {min(scores):.3f}")
	    print(f"✓ Max Score: {max(scores):.3f}")

	    # Validation checks
	    print("\n" + "=" * 80)
	    print("VALIDATION CHECKS")
	    print("=" * 80)

	    dimension_scores = [
	        ("Clinical Accuracy", evaluation_result.clinical_accuracy.score),
	        ("Evidence Grounding", evaluation_result.evidence_grounding.score),
	        ("Actionability", evaluation_result.actionability.score),
	        ("Clarity", evaluation_result.clarity.score),
	        ("Safety & Completeness", evaluation_result.safety_completeness.score),
	    ]

	    # Check every dimension before asserting so that all out-of-range scores
	    # are printed, not just the first one.
	    all_valid = True
	    for name, score in dimension_scores:
	        if 0.0 <= score <= 1.0:
	            print(f"✓ {name}: Score in valid range [0.0, 1.0]")
	        else:
	            print(f"✗ {name}: Score OUT OF RANGE: {score}")
	            all_valid = False

	    print("\n" + "=" * 80)
	    if all_valid:
	        print("All evaluators passed validation")
	    else:
	        print("Some evaluators failed validation")
	    print("=" * 80)

	    assert all_valid, "Some evaluators had scores out of valid range"
	    assert avg_score > 0.0, "Average evaluation score should be positive"


	# Script entry point: run the test directly, without going through pytest.
	# NOTE: calling the function directly does not apply the ``skipif`` marker,
	# so the evaluation runs even when no LLM API key is set.
	if __name__ == "__main__":
	    print("\nStarting 5D Evaluation System Test\n")
	    test_evaluation_system()
	    # Reached only when the call above returned without raising.
	    print("\nTest completed successfully!")