agentic-rag / scripts /evaluate.py
schoon's picture
feat: production-grade agentic RAG pipeline over UK legislation
ba1b7a4
raw
history blame contribute delete
4.06 kB
#!/usr/bin/env python
# PHASE4:evaluate_cli
"""
CLI entry point for running Phase 4 evaluation.

Usage:
    poetry run python scripts/evaluate.py
    poetry run python scripts/evaluate.py --dataset data/golden_dataset.json
    poetry run python scripts/evaluate.py --run-name "my-run" --fail-on-threshold
    poetry run python scripts/evaluate.py --no-mlflow
    poetry run python scripts/evaluate.py --provider groq
    poetry run python scripts/evaluate.py --provider groq-quality --lexical-only
"""
from __future__ import annotations
import argparse
import logging
import sys
from pathlib import Path
# Ensure project root is on sys.path when running directly
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.config.settings import Settings
from src.evaluation.runner import EvaluationRunner
from src.observability.logging import setup_logging
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Run RAGAS evaluation over the golden dataset."
)
parser.add_argument(
"--dataset",
default="data/golden_dataset.json",
help="Path to the golden dataset JSON (default: data/golden_dataset.json)",
)
parser.add_argument(
"--run-name",
default="eval",
help="MLflow run name / label for this evaluation (default: eval)",
)
parser.add_argument(
"--fail-on-threshold",
action="store_true",
help="Exit with code 1 if any metric is below the configured threshold",
)
parser.add_argument(
"--no-mlflow",
action="store_true",
help="Skip logging results to MLflow",
)
parser.add_argument(
"--lexical-only",
action="store_true",
help="Use lexical fallback metrics only β€” no LLM required for evaluation",
)
parser.add_argument(
"--provider",
default=None,
help='Override LLM_PROVIDER for this run (e.g. "ollama", "groq", "groq-quality")',
)
return parser.parse_args()
def main() -> int:
    """Run the evaluation end to end and return the process exit code.

    Parses the CLI options, applies the optional provider override, builds
    settings and logging, runs the evaluation, prints the report summary,
    optionally logs the report to MLflow, and maps the outcome to an exit
    code.

    Returns:
        ``1`` when ``--fail-on-threshold`` was given and ``report.passed`` is
        falsy, ``0`` otherwise. A failure while logging to MLflow is reported
        as a warning and never changes the exit code.
    """
    import os

    # parse_args first — provider override must go into os.environ BEFORE
    # Settings() is instantiated, because dependencies.py uses @lru_cache on
    # get_settings(). Mutating a local Settings() instance has no effect on
    # the cached instance the agent uses.
    args = parse_args()
    if args.provider is not None:
        os.environ["LLM_PROVIDER"] = args.provider

    settings = Settings()
    setup_logging(settings.log_level)
    logger = logging.getLogger(__name__)
    logger.info(
        "starting evaluation",
        extra={"dataset": args.dataset, "run_name": args.run_name},
    )

    # Imported only after the environment override and logging are in place.
    from src.evaluation.ragas_evaluator import RagasConfig

    runner = EvaluationRunner(
        settings=settings,
        ragas_config=RagasConfig(
            llm_model=settings.llm_model,
            use_llm_metrics=not args.lexical_only,
        ),
    )
    report = runner.run(dataset_path=args.dataset, run_name=args.run_name)

    print("\n" + "=" * 60)
    print(report.summary())
    print("=" * 60 + "\n")

    if not args.no_mlflow:
        # Best-effort: the evaluation has already run, so a tracking failure
        # (including a failed import of the MLflow logger) only warns.
        try:
            from src.evaluation.mlflow_logger import MlflowLogger

            run_id = MlflowLogger().log(
                report,
                params={
                    "llm_model": settings.llm_model,
                    "embedding_model": settings.embedding_model,
                    "retrieval_top_k": settings.retrieval_top_k,
                    "reranker": settings.reranker,
                    "dataset": args.dataset,
                },
            )
        except Exception as exc:
            # exc_info keeps the traceback in the log; the console message
            # stays a single short line.
            logger.warning(
                "MLflow logging failed",
                extra={"error": str(exc)},
                exc_info=True,
            )
            print(f"Warning: MLflow logging failed — {exc}")
        else:
            print(f"MLflow run ID: {run_id}")

    if args.fail_on_threshold and not report.passed:
        print("EVALUATION FAILED: one or more metrics below threshold.")
        return 1
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())