Agentic-RagBot / tests /test_evaluation_system.py
T0X1N's picture
chore: codebase audit and fixes (ruff, mypy, pytest)
9659593
"""
Test the 5D Evaluation System
Tests all evaluators with real diabetes patient output
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import json
import pytest
import os
from src.evaluation.evaluators import run_full_evaluation
from src.state import AgentOutput
@pytest.mark.skipif(
not os.environ.get("GROQ_API_KEY") and not os.environ.get("GOOGLE_API_KEY"), reason="No LLM API key available"
)
def test_evaluation_system():
"""Test evaluation system with diabetes patient data"""
print("=" * 80)
print("TESTING 5D EVALUATION SYSTEM")
print("=" * 80)
# Load test output from diabetes patient
test_output_path = Path(__file__).parent / "test_output_diabetes.json"
with open(test_output_path, encoding="utf-8") as f:
final_response = json.load(f)
print(f"\nβœ“ Loaded test data from: {test_output_path}")
print(f" - Disease: {final_response['prediction_explanation']['primary_disease']}")
print(f" - Confidence: {final_response['prediction_explanation']['confidence']:.1%}")
print(f" - Out of range biomarkers: {final_response['patient_summary']['biomarkers_out_of_range']}")
print(f" - Critical alerts: {len(final_response['safety_alerts'])}")
# Reconstruct patient biomarkers from test output
biomarkers = {
"Glucose": 185.0,
"HbA1c": 8.2,
"Cholesterol": 235.0,
"Triglycerides": 210.0,
"HDL": 38.0,
"LDL": 155.0,
"VLDL": 42.0,
"Total_Protein": 6.8,
"Albumin": 4.2,
"Globulin": 2.6,
"AG_Ratio": 1.6,
"Bilirubin_Total": 0.9,
"Bilirubin_Direct": 0.2,
"ALT": 35.0,
"AST": 28.0,
"ALP": 95.0,
"Creatinine": 1.1,
"BUN": 18.0,
"BUN_Creatinine_Ratio": 16.4,
"Uric_Acid": 6.2,
"WBC": 7200.0,
"RBC": 4.7,
"Hemoglobin": 14.2,
"Hematocrit": 42.0,
"Platelets": 245.0,
}
print(f"\nβœ“ Reconstructed {len(biomarkers)} biomarker values")
# Mock agent outputs to provide PubMed context for Clinical Accuracy evaluator
disease_explainer_context = """
Type 2 diabetes (T2D) accounts for the majority of cases and results
primarily from insulin resistance with a progressive beta-cell secretory defect.
Pathophysiology:
- Insulin resistance in peripheral tissues (muscle, liver, adipose)
- Progressive decline in beta-cell function
- Impaired glucose homeostasis leading to hyperglycemia
- Long-term complications affecting cardiovascular, renal, and neurological systems
Key Biomarkers:
- Fasting glucose β‰₯126 mg/dL indicates diabetes
- HbA1c β‰₯6.5% indicates diabetes
- Elevated cholesterol and triglycerides common due to dyslipidemia
- HDL typically reduced in metabolic syndrome
Clinical Management:
- Lifestyle modifications (diet, exercise)
- Pharmacological intervention (metformin, insulin sensitizers)
- Regular monitoring of glycemic control
- Cardiovascular risk management
"""
agent_outputs = [
AgentOutput(
agent_name="Disease Explainer",
findings=disease_explainer_context,
metadata={"citations": ["diabetes.pdf", "MediGuard_Diabetes_Guidelines_Extensive.pdf"]},
),
AgentOutput(
agent_name="Biomarker Analyzer",
findings="Analyzed 25 biomarkers. Found 19 out of range, 3 critical values.",
metadata={"citations": []},
),
AgentOutput(
agent_name="Biomarker-Disease Linker",
findings="Glucose and HbA1c are primary drivers for Type 2 Diabetes prediction.",
metadata={"citations": ["diabetes.pdf"]},
),
AgentOutput(
agent_name="Clinical Guidelines",
findings="Recommend immediate medical consultation, lifestyle modifications.",
metadata={"citations": ["diabetes.pdf"]},
),
AgentOutput(
agent_name="Confidence Assessor",
findings="High confidence prediction (87%) based on strong biomarker evidence.",
metadata={"citations": []},
),
]
print(f"βœ“ Created {len(agent_outputs)} mock agent outputs for evaluation context")
# Run full evaluation
print("\n" + "=" * 80)
print("RUNNING EVALUATION PIPELINE")
print("=" * 80)
try:
evaluation_result = run_full_evaluation(
final_response=final_response, agent_outputs=agent_outputs, biomarkers=biomarkers
)
# Display results
print("\n" + "=" * 80)
print("5D EVALUATION RESULTS")
print("=" * 80)
print(f"\n1. πŸ“Š Clinical Accuracy: {evaluation_result.clinical_accuracy.score:.3f}")
print(f" Reasoning: {evaluation_result.clinical_accuracy.reasoning[:200]}...")
print(f"\n2. πŸ“š Evidence Grounding: {evaluation_result.evidence_grounding.score:.3f}")
print(f" Reasoning: {evaluation_result.evidence_grounding.reasoning}")
print(f"\n3. ⚑ Actionability: {evaluation_result.actionability.score:.3f}")
print(f" Reasoning: {evaluation_result.actionability.reasoning[:200]}...")
print(f"\n4. πŸ’‘ Clarity: {evaluation_result.clarity.score:.3f}")
print(f" Reasoning: {evaluation_result.clarity.reasoning}")
print(f"\n5. πŸ›‘οΈ Safety & Completeness: {evaluation_result.safety_completeness.score:.3f}")
print(f" Reasoning: {evaluation_result.safety_completeness.reasoning}")
# Summary
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
scores = evaluation_result.to_vector()
avg_score = sum(scores) / len(scores)
print(f"\nβœ“ Evaluation Vector: {[f'{s:.3f}' for s in scores]}")
print(f"βœ“ Average Score: {avg_score:.3f}")
print(f"βœ“ Min Score: {min(scores):.3f}")
print(f"βœ“ Max Score: {max(scores):.3f}")
# Validation checks
print("\n" + "=" * 80)
print("VALIDATION CHECKS")
print("=" * 80)
all_valid = True
for i, (name, score) in enumerate(
[
("Clinical Accuracy", evaluation_result.clinical_accuracy.score),
("Evidence Grounding", evaluation_result.evidence_grounding.score),
("Actionability", evaluation_result.actionability.score),
("Clarity", evaluation_result.clarity.score),
("Safety & Completeness", evaluation_result.safety_completeness.score),
],
1,
):
if 0.0 <= score <= 1.0:
print(f"βœ“ {name}: Score in valid range [0.0, 1.0]")
else:
print(f"βœ— {name}: Score OUT OF RANGE: {score}")
all_valid = False
if all_valid:
print("\n" + "=" * 80)
print("All evaluators passed validation")
print("=" * 80)
else:
print("\n" + "=" * 80)
print("Some evaluators failed validation")
print("=" * 80)
assert all_valid, "Some evaluators had scores out of valid range"
assert avg_score > 0.0, "Average evaluation score should be positive"
except Exception as e:
print("\n" + "=" * 80)
print("Evaluation failed")
print("=" * 80)
print(f"\nError: {type(e).__name__}: {e!s}")
import traceback
traceback.print_exc()
raise
if __name__ == "__main__":
print("\nStarting 5D Evaluation System Test\n")
test_evaluation_system()
print("\nTest completed successfully!")