#!/usr/bin/env python # PHASE4:evaluate_cli """ CLI entry point for running Phase 4 evaluation. Usage: poetry run python scripts/evaluate.py poetry run python scripts/evaluate.py --dataset data/golden_dataset.json poetry run python scripts/evaluate.py --run-name "my-run" --fail-on-threshold poetry run python scripts/evaluate.py --no-mlflow poetry run python scripts/evaluate.py --provider groq poetry run python scripts/evaluate.py --provider groq-quality --lexical-only """ from __future__ import annotations import argparse import logging import sys from pathlib import Path # Ensure project root is on sys.path when running directly sys.path.insert(0, str(Path(__file__).parent.parent)) from src.config.settings import Settings from src.evaluation.runner import EvaluationRunner from src.observability.logging import setup_logging def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Run RAGAS evaluation over the golden dataset." ) parser.add_argument( "--dataset", default="data/golden_dataset.json", help="Path to the golden dataset JSON (default: data/golden_dataset.json)", ) parser.add_argument( "--run-name", default="eval", help="MLflow run name / label for this evaluation (default: eval)", ) parser.add_argument( "--fail-on-threshold", action="store_true", help="Exit with code 1 if any metric is below the configured threshold", ) parser.add_argument( "--no-mlflow", action="store_true", help="Skip logging results to MLflow", ) parser.add_argument( "--lexical-only", action="store_true", help="Use lexical fallback metrics only — no LLM required for evaluation", ) parser.add_argument( "--provider", default=None, help='Override LLM_PROVIDER for this run (e.g. "ollama", "groq", "groq-quality")', ) return parser.parse_args() def main() -> int: import os # parse_args first — provider override must go into os.environ BEFORE # Settings() is instantiated, because dependencies.py uses @lru_cache on # get_settings(). Mutating a local Settings() instance has no effect on # the cached instance the agent uses. args = parse_args() if args.provider is not None: os.environ["LLM_PROVIDER"] = args.provider settings = Settings() setup_logging(settings.log_level) logger = logging.getLogger(__name__) logger.info( "starting evaluation", extra={"dataset": args.dataset, "run_name": args.run_name}, ) from src.evaluation.ragas_evaluator import RagasConfig runner = EvaluationRunner( settings=settings, ragas_config=RagasConfig( llm_model=settings.llm_model, use_llm_metrics=not args.lexical_only, ), ) report = runner.run(dataset_path=args.dataset, run_name=args.run_name) print("\n" + "=" * 60) print(report.summary()) print("=" * 60 + "\n") if not args.no_mlflow: try: from src.evaluation.mlflow_logger import MlflowLogger logger_ml = MlflowLogger() run_id = logger_ml.log( report, params={ "llm_model": settings.llm_model, "embedding_model": settings.embedding_model, "retrieval_top_k": settings.retrieval_top_k, "reranker": settings.reranker, "dataset": args.dataset, }, ) print(f"MLflow run ID: {run_id}") except Exception as exc: logger.warning("MLflow logging failed", extra={"error": str(exc)}) print(f"Warning: MLflow logging failed — {exc}") if args.fail_on_threshold and not report.passed: print("EVALUATION FAILED: one or more metrics below threshold.") return 1 return 0 if __name__ == "__main__": sys.exit(main())