Spaces:

Rom89823974978
/

RAG_Eval

Sleeping

File size: 8,672 Bytes

e8c3964

#!/usr/bin/env python
"""
run_experiments.py
==================

High-level driver that wires together:

1.  YAML / CLI → `PipelineConfig` + `LoggingConfig`
2.  Initialises dual-sink logging (console + rotating file)
3.  Builds a `RAGPipeline`
4.  Streams a list of questions through the pipeline
5.  Logs progress, writes per-query JSONL results, and logs aggregate
    statistics; optionally runs significance tests against a baseline
    run (`--baseline`) and saves diagnostic plots (`--plots`).

You can keep it minimal – or extend it to:
* compute metrics immediately
* push results to a tracker (W&B, MLflow, etc.)
* spawn multiple configs in parallel.
"""
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Any, Dict, Iterable, List, Mapping

import yaml

from evaluation import (
    PipelineConfig,
    RetrieverConfig,
    GeneratorConfig,
    CrossEncoderConfig,
    StatsConfig,
    LoggingConfig,
    RAGPipeline,
)
from evaluation.utils.logger import init_logging

from evaluation.stats import (
    corr_ci,
    wilcoxon_signed_rank,
    holm_bonferroni,
)

import matplotlib.pyplot as plt

# ──────────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────────


def _merge_dataclass(dc_cls, default, override: Mapping[str, Any] | None):
    """Return a new *dc_cls* where fields from *override* overwrite *default*.

    Args:
        dc_cls: Dataclass type to instantiate.  An instance is also accepted,
            in which case its type is used.
        default: Dataclass instance supplying the baseline field values.
        override: Mapping of field name → new value.  Entries whose value is
            ``None`` are ignored (treated as "not set"); ``None`` itself is
            treated as an empty mapping.

    Returns:
        A freshly constructed dataclass instance that shares no mutable state
        with *default*.

    Raises:
        TypeError: If *default* is not a dataclass instance, or *override*
            names something the constructor of *dc_cls* does not accept.
    """
    import copy
    from dataclasses import fields

    if not isinstance(dc_cls, type):
        dc_cls = type(dc_cls)

    # Copy field by field instead of using ``asdict``: ``asdict`` recurses
    # into nested dataclasses and would hand them back as plain dicts.
    # Fields declared with ``init=False`` cannot be passed to the constructor.
    merged = {
        f.name: copy.deepcopy(getattr(default, f.name))
        for f in fields(default)
        if f.init
    }
    if override:
        merged.update({k: v for k, v in override.items() if v is not None})
    return dc_cls(**merged)


def _load_pipeline_config(yaml_path: Path | None) -> PipelineConfig:
    """Parse YAML into nested dataclasses; fall back to defaults.

    The YAML document may contain the optional top-level sections
    ``retriever``, ``generator``, ``reranker``, ``stats`` and ``logging``.
    Each section is merged over the defaults of the matching config
    dataclass; absent or empty sections leave the defaults untouched.

    Args:
        yaml_path: Path of the YAML file, or ``None`` to use all defaults.

    Returns:
        The assembled ``PipelineConfig``.

    Raises:
        ValueError: If the YAML document is not a mapping at the top level.
    """
    if yaml_path is None:
        return PipelineConfig()  # all defaults

    # ``safe_load`` yields None for an empty document – treat as "no overrides".
    data = yaml.safe_load(yaml_path.read_text(encoding="utf-8")) or {}
    if not isinstance(data, dict):
        raise ValueError(
            f"{yaml_path}: expected a mapping at the top level, "
            f"got {type(data).__name__}"
        )

    def _section(dc_cls, key: str):
        # Pass the *class* (not an instance) so the merged values can be used
        # to construct a new object.  A bare ``key:`` parses to None.
        return _merge_dataclass(dc_cls, dc_cls(), data.get(key) or {})

    return PipelineConfig(
        retriever=_section(RetrieverConfig, "retriever"),
        generator=_section(GeneratorConfig, "generator"),
        reranker=_section(CrossEncoderConfig, "reranker"),
        stats=_section(StatsConfig, "stats"),
        logging=_section(LoggingConfig, "logging"),
    )


def _read_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Load a JSON Lines file into a list of decoded records.

    The file is read as UTF-8.  Lines that are empty or contain only
    whitespace are skipped, so blank separator lines and extra trailing
    newlines do not abort the load.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.  The
            message is prefixed with the file path and line number.
    """
    records: List[Dict[str, Any]] = []
    with path.open(encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type as before, with file context added.
                raise json.JSONDecodeError(
                    f"{path}, line {lineno}: {err.msg}", err.doc, err.pos
                ) from err
    return records


def _write_jsonl(path: Path, rows: Iterable[Mapping[str, Any]]):
    """Serialise *rows* to *path* as JSON Lines, one JSON document per line.

    Missing parent directories are created first; an existing file at *path*
    is overwritten.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w") as sink:
        # The generator is consumed lazily, so records are serialised and
        # written one after another.
        sink.writelines(json.dumps(record) + "\n" for record in rows)

# Stats Helper
def aggregate_metrics(rows: list[dict[str, Any]]) -> dict[str, float]:
    """Return mean of every numeric metric found under row['metrics'].

    Args:
        rows: Per-query result records.  Each may carry a ``"metrics"``
            mapping of metric name → value.

    Returns:
        Metric name → arithmetic mean, in first-seen key order.  The mean of
        a metric is taken over the rows that actually report a numeric value
        for it; rows without ``"metrics"`` and non-numeric values are
        skipped.  An empty *rows* yields an empty dict.
    """
    import numbers

    import numpy as np

    collected: dict[str, list[float]] = {}
    for row in rows:
        for name, value in (row.get("metrics") or {}).items():
            # Only real numbers can be averaged; ignore strings, None, lists…
            if isinstance(value, numbers.Real):
                collected.setdefault(name, []).append(float(value))
    return {name: float(np.mean(values)) for name, values in collected.items()}


def correlation_with_gold(rows: list[dict[str, Any]], cfg: StatsConfig):
    """Correlate per-query MRR with the human correctness flag.

    Only rows that carry a ``"human_correct"`` key take part.  A row without
    an ``"mrr"`` metric contributes NaN for that query, which is passed to
    ``corr_ci`` unchanged.

    Args:
        rows: Per-query result records.
        cfg: Statistics settings; ``correlation_method``, ``n_boot`` and
            ``ci`` are forwarded to ``corr_ci``.

    Returns:
        ``None`` when no row is labelled (nothing to correlate); otherwise a
        dict with keys ``r``, ``ci_low``, ``ci_high`` and ``p``.
    """
    labelled = [row for row in rows if "human_correct" in row]
    if not labelled:
        return None  # nothing to correlate

    mrr = [(row.get("metrics") or {}).get("mrr", float("nan")) for row in labelled]
    gold = [1.0 if row["human_correct"] else 0.0 for row in labelled]
    coef, (lo, hi), p = corr_ci(
        mrr, gold, method=cfg.correlation_method, n_boot=cfg.n_boot, ci=cfg.ci
    )
    return dict(r=coef, ci_low=lo, ci_high=hi, p=p)


def wilcoxon_against_baseline(
    cur: list[dict[str, Any]],
    base: list[dict[str, Any]],
    cfg: StatsConfig,
):
    """Paired Wilcoxon + Holm-Bonferroni across all metric keys.

    Rows are paired by position: ``cur[i]`` is compared with ``base[i]``.
    The metric names are taken from the first row of *cur*.

    Args:
        cur: Per-query results of the current run.
        base: Per-query results of the baseline run.
        cfg: Statistics settings; ``wilcoxon_alternative`` is forwarded to
            ``wilcoxon_signed_rank``.

    Returns:
        Whatever ``holm_bonferroni`` returns for the raw per-metric p-values.

    Raises:
        ValueError: If the runs differ in length or are empty.
        KeyError: If a row lacks ``"metrics"`` or one of the metric names.
    """
    # Explicit check rather than ``assert`` so it survives ``python -O``.
    if len(cur) != len(base):
        raise ValueError(
            f"Runs must have same #queries (current={len(cur)}, baseline={len(base)})"
        )
    if not cur:
        raise ValueError("Cannot run significance tests on empty runs")

    p_raw = {}
    for m in cur[0]["metrics"]:
        cur_m = [r["metrics"][m] for r in cur]
        base_m = [r["metrics"][m] for r in base]
        _, p = wilcoxon_signed_rank(cur_m, base_m, alternative=cfg.wilcoxon_alternative)
        p_raw[m] = p
    return holm_bonferroni(p_raw)

# Plot helper
def save_scatter(rows, out_dir: Path):
    """Save a scatter plot of MRR against human correctness.

    Only rows that report an ``"mrr"`` metric are plotted.  The x and y
    values are built from the same filtered rows so both series always have
    equal length.  A missing or falsy ``"human_correct"`` plots as 0.

    Args:
        rows: Per-query result records.
        out_dir: Directory for the image; created if missing.

    Returns:
        Path of the written file, ``out_dir / "mrr_vs_correct.png"``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    points = [
        (r["metrics"]["mrr"], 1.0 if r.get("human_correct") else 0.0)
        for r in rows
        if "mrr" in (r.get("metrics") or {})
    ]
    x = [px for px, _ in points]
    y = [py for _, py in points]

    path = out_dir / "mrr_vs_correct.png"
    fig, ax = plt.subplots()
    try:
        ax.scatter(x, y, alpha=0.6)
        ax.set_xlabel("MRR")
        ax.set_ylabel("Correct (1=yes)")
        ax.set_title("MRR vs. Human Correctness")
        fig.savefig(path, bbox_inches="tight")
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
    return path

# ──────────────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────────────
def _build_arg_parser() -> argparse.ArgumentParser:
    """Construct the command-line interface of the experiment driver."""
    ap = argparse.ArgumentParser(description="Run RAG evaluation experiments.")
    ap.add_argument("--config", type=Path, help="YAML config with pipeline settings")
    ap.add_argument(
        "--queries",
        type=Path,
        required=True,
        help="JSONL file – each line must contain at least {'question': ...}",
    )
    ap.add_argument(
        "--output",
        type=Path,
        default=Path("outputs/results.jsonl"),
        help="Where to write JSONL results",
    )
    ap.add_argument("--dry-run", action="store_true", help="Do not execute pipeline")
    ap.add_argument(
        "--baseline",
        type=Path,
        help="Optional: JSONL with baseline run for significance tests",
    )
    ap.add_argument(
        "--plots",
        action="store_true",
        help="Save diagnostic plots (PNG) alongside results",
    )
    return ap


def main(argv: list[str] | None = None) -> None:
    """Run one evaluation experiment end to end.

    Steps: parse CLI arguments and configuration, initialise logging, build
    the pipeline, run every query through it, write the merged results as
    JSONL, then log aggregate metrics and – when requested – baseline
    significance tests and a diagnostic plot.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Raises:
        SystemExit: On argument errors, and with status 0 for ``--dry-run``.
        KeyError: If any query record lacks a ``"question"`` field.
    """
    import logging

    args = _build_arg_parser().parse_args(argv)

    # 1. Parse configuration
    cfg = _load_pipeline_config(args.config)

    # 2. Initialise logging (file + stderr)
    init_logging(
        log_dir=cfg.logging.log_dir,
        level=cfg.logging.level,
        max_mb=cfg.logging.max_mb,
        backups=cfg.logging.backups,
    )
    logger = logging.getLogger(__name__)
    logger.info("Loaded PipelineConfig:\n%s", cfg)

    # 3. Build pipeline (retrieval → (rerank) → generation)
    # NOTE(review): the pipeline is constructed even for --dry-run, presumably
    # so that construction errors surface early – confirm this is intended.
    pipeline = RAGPipeline(cfg)

    # 4. Load queries
    rows = _read_jsonl(args.queries)
    logger.info("Loaded %d queries from %s", len(rows), args.queries)

    if args.dry_run:
        logger.warning("Dry-run flag active – exiting before execution.")
        sys.exit(0)

    # Fail fast: discovering a malformed record halfway through would throw
    # away every pipeline call made before it.
    malformed = [i for i, row in enumerate(rows, 1) if "question" not in row]
    if malformed:
        raise KeyError(
            f"{args.queries}: record(s) {malformed[:10]} lack a 'question' field "
            f"({len(malformed)} malformed in total)"
        )

    # 5. Execute pipeline
    results: List[Dict[str, Any]] = []
    for i, row in enumerate(rows, 1):
        q = row["question"]
        logger.info("[%d/%d] Q: %s", i, len(rows), q)
        out = pipeline.run(q)
        merged = {**row, **out}  # keep any gold labels or metadata
        results.append(merged)

    # 6. Persist results
    _write_jsonl(args.output, results)
    logger.info("Wrote %d results to %s", len(results), args.output)

    if not results:
        # The statistics helpers need at least one row.
        logger.warning("No queries were processed – skipping statistics and plots.")
        return

    # 7. Aggregate statistics, significance tests, plots
    agg = aggregate_metrics(results)
    logger.info("Mean metrics: %s", json.dumps(agg, indent=2))

    corr = correlation_with_gold(results, cfg.stats)
    if corr is not None:
        # Report the configured interval level instead of a hard-coded 95 %.
        logger.info(
            "Correlation MRR↔gold  %s=%.3f  CI(%s)=[%.3f, %.3f]  p=%.3g",
            cfg.stats.correlation_method,
            corr["r"],
            cfg.stats.ci,
            corr["ci_low"],
            corr["ci_high"],
            corr["p"],
        )

    if args.baseline:
        baseline_rows = _read_jsonl(args.baseline)
        p_adj = wilcoxon_against_baseline(results, baseline_rows, cfg.stats)
        logger.info("Wilcoxon vs baseline (Holm-Bonferroni α=%s): %s", cfg.stats.alpha, p_adj)

    if args.plots:
        plot_path = save_scatter(results, args.output.parent)
        logger.info("Saved plot → %s", plot_path)

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()