Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python | |
| # PHASE4:evaluate_cli | |
| """ | |
| CLI entry point for running Phase 4 evaluation. | |
| Usage: | |
| poetry run python scripts/evaluate.py | |
| poetry run python scripts/evaluate.py --dataset data/golden_dataset.json | |
| poetry run python scripts/evaluate.py --run-name "my-run" --fail-on-threshold | |
| poetry run python scripts/evaluate.py --no-mlflow | |
| poetry run python scripts/evaluate.py --provider groq | |
| poetry run python scripts/evaluate.py --provider groq-quality --lexical-only | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import logging | |
| import sys | |
| from pathlib import Path | |
| # Ensure project root is on sys.path when running directly | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from src.config.settings import Settings | |
| from src.evaluation.runner import EvaluationRunner | |
| from src.observability.logging import setup_logging | |
| def parse_args() -> argparse.Namespace: | |
| parser = argparse.ArgumentParser( | |
| description="Run RAGAS evaluation over the golden dataset." | |
| ) | |
| parser.add_argument( | |
| "--dataset", | |
| default="data/golden_dataset.json", | |
| help="Path to the golden dataset JSON (default: data/golden_dataset.json)", | |
| ) | |
| parser.add_argument( | |
| "--run-name", | |
| default="eval", | |
| help="MLflow run name / label for this evaluation (default: eval)", | |
| ) | |
| parser.add_argument( | |
| "--fail-on-threshold", | |
| action="store_true", | |
| help="Exit with code 1 if any metric is below the configured threshold", | |
| ) | |
| parser.add_argument( | |
| "--no-mlflow", | |
| action="store_true", | |
| help="Skip logging results to MLflow", | |
| ) | |
| parser.add_argument( | |
| "--lexical-only", | |
| action="store_true", | |
| help="Use lexical fallback metrics only β no LLM required for evaluation", | |
| ) | |
| parser.add_argument( | |
| "--provider", | |
| default=None, | |
| help='Override LLM_PROVIDER for this run (e.g. "ollama", "groq", "groq-quality")', | |
| ) | |
| return parser.parse_args() | |
def main() -> int:
    """Run the evaluation end to end and return the process exit code.

    Parses the CLI arguments, applies the optional provider override, builds
    the settings and logging, runs the evaluation over the dataset, prints
    the report summary, optionally logs the report to MLflow, and finally
    applies the threshold gate.

    Returns:
        1 when ``--fail-on-threshold`` was given and ``report.passed`` is
        falsy, otherwise 0. A failure while logging to MLflow is reported as
        a warning and does not change the exit code.
    """
    import os

    # parse_args first — provider override must go into os.environ BEFORE
    # Settings() is instantiated, because dependencies.py uses @lru_cache on
    # get_settings(). Mutating a local Settings() instance has no effect on
    # the cached instance the agent uses.
    args = parse_args()
    if args.provider is not None:
        os.environ["LLM_PROVIDER"] = args.provider

    settings = Settings()
    setup_logging(settings.log_level)
    logger = logging.getLogger(__name__)
    logger.info(
        "starting evaluation",
        extra={"dataset": args.dataset, "run_name": args.run_name},
    )

    from src.evaluation.ragas_evaluator import RagasConfig

    runner = EvaluationRunner(
        settings=settings,
        ragas_config=RagasConfig(
            llm_model=settings.llm_model,
            # --lexical-only switches the LLM-based metrics off.
            use_llm_metrics=not args.lexical_only,
        ),
    )
    report = runner.run(dataset_path=args.dataset, run_name=args.run_name)

    print("\n" + "=" * 60)
    print(report.summary())
    print("=" * 60 + "\n")

    if not args.no_mlflow:
        # Best effort: the import sits inside the try block as well, so a
        # failing MLflow import is reported the same way as a failing log
        # call instead of aborting the run.
        try:
            from src.evaluation.mlflow_logger import MlflowLogger

            logger_ml = MlflowLogger()
            run_id = logger_ml.log(
                report,
                params={
                    "llm_model": settings.llm_model,
                    "embedding_model": settings.embedding_model,
                    "retrieval_top_k": settings.retrieval_top_k,
                    "reranker": settings.reranker,
                    "dataset": args.dataset,
                },
            )
            print(f"MLflow run ID: {run_id}")
        except Exception as exc:
            logger.warning("MLflow logging failed", extra={"error": str(exc)})
            print(f"Warning: MLflow logging failed — {exc}")

    if args.fail_on_threshold and not report.passed:
        print("EVALUATION FAILED: one or more metrics below threshold.")
        return 1
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    raise SystemExit(main())