Spaces:

Renangi
/

ragbench-rag-eval

Running

App Files Files Community

Renangi commited on 27 days ago

Commit

c8dfbc0

0 Parent(s):

Initial commit without secrets

Browse files

Files changed (19) hide show

.gitignore +4 -0
Dockerfile +22 -0
README.md +93 -0
app/1111-main - Copy.py +41 -0
app/222222-main - Copy.py +244 -0
app/__init__.py +0 -0
app/main.py +428 -0
docker-compose.yml +12 -0
prompts/ragbench_judge_prompt.txt +6 -0
ragbench_eval/__init__.py +1 -0
ragbench_eval/config.py +26 -0
ragbench_eval/generator.py +32 -0
ragbench_eval/judge.py +60 -0
ragbench_eval/llm.py +47 -0
ragbench_eval/metrics.py +77 -0
ragbench_eval/pipeline.py +104 -0
ragbench_eval/retriever.py +32 -0
requirements.txt +10 -0
scripts/run_experiment.py +35 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+.env
+.venv/
+__pycache__/
+*.pyc

Dockerfile ADDED Viewed

	@@ -0,0 +1,22 @@

+# Base image: slim Python 3.11.
+FROM python:3.11-slim
+WORKDIR /app
+# Install build-essential and git, then drop the apt package lists.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential git && \
+    rm -rf /var/lib/apt/lists/*
+# requirements.txt is copied and installed before the source directories.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY ragbench_eval ./ragbench_eval
+COPY app ./app
+COPY scripts ./scripts
+COPY prompts ./prompts
+ENV PYTHONUNBUFFERED=1
+# Hugging Face Spaces expect 7860
+EXPOSE 7860
+# Serve the FastAPI app object `app` from app/main.py on all interfaces, port 7860.
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md ADDED Viewed

	@@ -0,0 +1,93 @@

+---
+title: ragbench-rag-eval
+emoji: "📊"
+colorFrom: blue
+colorTo: indigo
+sdk: docker
+pinned: false
+---
+# RAGBench RAG Evaluation Project
+This project evaluates a RAG system on the RAGBench dataset across 5 domains:
+Biomedical, General Knowledge, Legal, Customer Support, and Finance.
+# RAGBench RAG Evaluation Project
+This project evaluates a RAG system on the RAGBench dataset across 5 domains:
+Biomedical, General Knowledge, Legal, Customer Support, and Finance.
+## 1. Setup (local, no Docker)
+```bash
+python -m venv .venv
+source .venv/bin/activate  # Windows: .venv\Scripts\activate
+pip install --upgrade pip
+pip install -r requirements.txt
+```
+Create a `.env` file in the project root (it is git-ignored; this repo does not include a `.env.example`) and fill in:
+- HF_TOKEN (if using Hugging Face models)
+- GROQ_API_KEY (if using Groq)
+- RAGBENCH_LLM_PROVIDER = groq or hf
+- RAGBENCH_GEN_MODEL
+- RAGBENCH_JUDGE_MODEL
+Also open `prompts/ragbench_judge_prompt.txt` and paste the official JSON
+annotation prompt from the RAGBench paper (Appendix 9.4), with placeholders:
+`{documents}`, `{question}`, `{answer}`.
+### Run an experiment from CLI
+```bash
+python -m scripts.run_experiment --domain biomedical --k 3 --max_examples 10
+```
+## 2. Run FastAPI locally (no Docker)
+```bash
+uvicorn app.main:app --host 0.0.0.0 --port 7860
+```
+Then open:
+- `http://localhost:7860/health`
+- `http://localhost:7860/docs` (Swagger UI)
+- POST `/run_domain` with JSON:
+```json
+{
+  "domain": "biomedical",
+  "k": 3,
+  "max_examples": 10,
+  "split": "test"
+}
+```
+## 3. Run with Docker (local laptop)
+Build and run:
+```bash
+docker compose build
+docker compose up
+```
+The API will be available at `http://localhost:8000`.
+## 4. Deploy to Hugging Face Space (Docker)
+1. Create a new Space with SDK = Docker.
+2. Push this repo to the Space Git URL.
+3. On the Space settings, add variables/secrets:
+   - HF_TOKEN
+   - GROQ_API_KEY
+   - RAGBENCH_LLM_PROVIDER
+   - RAGBENCH_GEN_MODEL
+   - RAGBENCH_JUDGE_MODEL
+4. Once the Space builds successfully, open `/docs` on the Space URL to run
+`/run_domain` for each domain via Swagger UI.

app/1111-main - Copy.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from fastapi import FastAPI
+from pydantic import BaseModel
+from ragbench_eval.pipeline import RagBenchExperiment
+app = FastAPI(title="RAGBench RAG Evaluation API")
+class RunRequest(BaseModel):
+    """JSON body accepted by ``POST /run_domain``."""
+    # Domain name handed to RagBenchExperiment.run_domain().
+    domain: str
+    # Passed to RagBenchExperiment as ``k``.
+    k: int = 3
+    # Passed to RagBenchExperiment as ``max_examples``.
+    max_examples: int = 20
+    # Passed to RagBenchExperiment as ``split``.
+    split: str = "test"
+@app.post("/run_domain")
+def run_domain(req: RunRequest):
+    """Build an experiment from the request and return its result for ``req.domain``."""
+    settings = {
+        "k": req.k,
+        "max_examples": req.max_examples,
+        "split": req.split,
+    }
+    experiment = RagBenchExperiment(**settings)
+    return experiment.run_domain(req.domain)
+@app.get("/health")
+def health():
+    """Liveness check: always reports status "ok"."""
+    payload = dict(status="ok")
+    return payload
+@app.get("/")
+def root():
+    """Return a short banner plus the list of available endpoints."""
+    endpoints = {
+        "health": "/health",
+        "docs": "/docs",
+        "run_domain": "/run_domain (POST)",
+    }
+    return {
+        "message": "RAGBench RAG Evaluation API is running.",
+        "endpoints": endpoints,
+    }

app/222222-main - Copy.py ADDED Viewed

	@@ -0,0 +1,244 @@

+from fastapi import FastAPI
+from fastapi.responses import HTMLResponse
+from pydantic import BaseModel
+from ragbench_eval.pipeline import RagBenchExperiment
+app = FastAPI(title="RAGBench RAG Evaluation API")
+class RunRequest(BaseModel):
+    """JSON body accepted by ``POST /run_domain``."""
+    # Domain name handed to RagBenchExperiment.run_domain().
+    domain: str
+    # Passed to RagBenchExperiment as ``k``.
+    k: int = 3
+    # Passed to RagBenchExperiment as ``max_examples``.
+    max_examples: int = 20
+    # Passed to RagBenchExperiment as ``split``.
+    split: str = "test"
+@app.post("/run_domain")
+def run_domain(req: RunRequest):
+    """Build an experiment from the request and return its result for ``req.domain``."""
+    settings = {
+        "k": req.k,
+        "max_examples": req.max_examples,
+        "split": req.split,
+    }
+    experiment = RagBenchExperiment(**settings)
+    return experiment.run_domain(req.domain)
+@app.get("/health")
+def health():
+    """Liveness check: always reports status "ok"."""
+    payload = dict(status="ok")
+    return payload
+@app.get("/")
+def root():
+    """Return a short banner plus the list of available endpoints."""
+    endpoints = {
+        "health": "/health",
+        "docs": "/docs",
+        "ui": "/ui",
+        "run_domain": "/run_domain (POST)",
+    }
+    return {
+        "message": "RAGBench RAG Evaluation API is running.",
+        "endpoints": endpoints,
+    }
+# ------------- NEW: simple frontend at /ui -----------------
+@app.get("/ui", response_class=HTMLResponse)
+def ui():
+    """Serve a static single-page HTML front end.
+
+    The page is one inline string (CSS, form and script). Its ``runDomain()``
+    script POSTs the form values as JSON to ``/run_domain`` with ``fetch`` and
+    prints the JSON reply into the ``<pre>`` block.
+    """
+    # The whole page is embedded so no static files or templates are needed.
+    html = """
+    <!DOCTYPE html>
+    <html lang="en">
+    <head>
+      <meta charset="UTF-8" />
+      <title>RAGBench RAG Evaluation UI</title>
+      <meta name="viewport" content="width=device-width, initial-scale=1" />
+      <style>
+        body {
+          font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
+          margin: 0;
+          padding: 0;
+          background: #f5f7fa;
+          color: #111827;
+        }
+        .wrapper {
+          max-width: 960px;
+          margin: 2rem auto;
+          padding: 1.5rem;
+          background: #ffffff;
+          border-radius: 0.75rem;
+          box-shadow: 0 10px 25px rgba(0, 0, 0, 0.06);
+        }
+        h1 {
+          margin-top: 0;
+          font-size: 1.6rem;
+        }
+        .row {
+          display: flex;
+          flex-wrap: wrap;
+          gap: 1rem;
+          margin-bottom: 1rem;
+        }
+        .field {
+          flex: 1 1 180px;
+          min-width: 160px;
+        }
+        label {
+          display: block;
+          font-size: 0.85rem;
+          font-weight: 600;
+          margin-bottom: 0.25rem;
+        }
+        select, input {
+          width: 100%;
+          padding: 0.45rem 0.55rem;
+          border-radius: 0.375rem;
+          border: 1px solid #d1d5db;
+          font-size: 0.9rem;
+          box-sizing: border-box;
+        }
+        button {
+          padding: 0.55rem 1.2rem;
+          border-radius: 999px;
+          border: none;
+          background: #2563eb;
+          color: #ffffff;
+          font-weight: 600;
+          font-size: 0.95rem;
+          cursor: pointer;
+        }
+        button:disabled {
+          opacity: 0.6;
+          cursor: default;
+        }
+        .actions {
+          margin-top: 0.5rem;
+          margin-bottom: 1rem;
+        }
+        .status {
+          font-size: 0.85rem;
+          margin-bottom: 0.5rem;
+          color: #4b5563;
+        }
+        pre {
+          background: #0b1020;
+          color: #e5e7eb;
+          padding: 1rem;
+          border-radius: 0.75rem;
+          overflow: auto;
+          max-height: 480px;
+          font-size: 0.8rem;
+        }
+        code {
+          font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
+        }
+        @media (max-width: 640px) {
+          .wrapper {
+            margin: 0.5rem;
+            border-radius: 0.5rem;
+          }
+        }
+      </style>
+    </head>
+    <body>
+      <div class="wrapper">
+        <h1>RAGBench RAG Evaluation</h1>
+        <p style="font-size:0.9rem; color:#4b5563;">
+          Use this UI to call <code>POST /run_domain</code> and inspect the metrics
+          for a given domain. The backend uses the RAGBench dataset and your configured LLMs.
+        </p>
+        <div class="row">
+          <div class="field">
+            <label for="domain">Domain</label>
+            <select id="domain">
+              <option value="biomedical">Biomedical</option>
+              <option value="general_knowledge">General Knowledge</option>
+              <option value="legal">Legal</option>
+              <option value="customer_support">Customer Support</option>
+              <option value="finance">Finance</option>
+            </select>
+          </div>
+          <div class="field">
+            <label for="k">Top-k documents</label>
+            <input id="k" type="number" value="3" min="1" />
+          </div>
+          <div class="field">
+            <label for="max_examples">Max examples</label>
+            <input id="max_examples" type="number" value="5" min="1" />
+          </div>
+          <div class="field">
+            <label for="split">Dataset split</label>
+            <input id="split" type="text" value="test" />
+          </div>
+        </div>
+        <div class="actions">
+          <button id="runBtn" onclick="runDomain()">Run Domain Evaluation</button>
+        </div>
+        <div class="status" id="status"></div>
+        <pre><code id="output">{}</code></pre>
+      </div>
+      <script>
+        async function runDomain() {
+          const domainEl = document.getElementById("domain");
+          const kEl = document.getElementById("k");
+          const maxExamplesEl = document.getElementById("max_examples");
+          const splitEl = document.getElementById("split");
+          const statusEl = document.getElementById("status");
+          const outputEl = document.getElementById("output");
+          const btn = document.getElementById("runBtn");
+          const domain = domainEl.value;
+          const k = parseInt(kEl.value || "3", 10);
+          const maxExamples = parseInt(maxExamplesEl.value || "5", 10);
+          const split = splitEl.value || "test";
+          const payload = {
+            domain: domain,
+            k: k,
+            max_examples: maxExamples,
+            split: split
+          };
+          statusEl.textContent = "Running evaluation...";
+          btn.disabled = true;
+          outputEl.textContent = "{}";
+          try {
+            const res = await fetch("/run_domain", {
+              method: "POST",
+              headers: {
+                "Content-Type": "application/json"
+              },
+              body: JSON.stringify(payload)
+            });
+            const data = await res.json();
+            if (!res.ok) {
+              statusEl.textContent = "Error " + res.status;
+            } else {
+              statusEl.textContent = "Done.";
+            }
+            outputEl.textContent = JSON.stringify(data, null, 2);
+          } catch (err) {
+            statusEl.textContent = "Request failed: " + err;
+            outputEl.textContent = "{}";
+          } finally {
+            btn.disabled = false;
+          }
+        }
+      </script>
+    </body>
+    </html>
+    """
+    return HTMLResponse(content=html)

app/__init__.py ADDED Viewed

File without changes

app/main.py ADDED Viewed

	@@ -0,0 +1,428 @@

+from typing import List, Tuple
+from fastapi import FastAPI
+from fastapi.responses import HTMLResponse
+from pydantic import BaseModel
+from datasets import load_dataset
+from ragbench_eval.pipeline import RagBenchExperiment
+from ragbench_eval.retriever import ExampleRetriever
+from ragbench_eval.generator import RAGGenerator
+from ragbench_eval.judge import RAGJudge
+from ragbench_eval.metrics import trace_from_attributes
+from ragbench_eval.config import RAGBENCH_DATASET
+app = FastAPI(title="RAGBench RAG Evaluation API")
+class RunRequest(BaseModel):
+    """JSON body accepted by ``POST /run_domain``."""
+    # Domain name handed to RagBenchExperiment.run_domain().
+    domain: str
+    # Passed to RagBenchExperiment as ``k``.
+    k: int = 3
+    # Passed to RagBenchExperiment as ``max_examples``.
+    max_examples: int = 20
+    # Passed to RagBenchExperiment as ``split``.
+    split: str = "test"
+class QAExampleRequest(BaseModel):
+    """JSON body accepted by ``POST /qa_example`` (one dataset row)."""
+    # ``subset`` and ``split`` are passed straight to datasets.load_dataset();
+    # ``index`` is range-checked against the loaded split by the endpoint.
+    subset: str           # e.g. "covidqa", "pubmedqa", "finqa"
+    index: int = 0        # which example in that subset
+    k: int = 3            # top-k docs
+    split: str = "test"   # usually "test"
+@app.post("/run_domain")
+def run_domain(req: RunRequest):
+    """Build an experiment from the request and return its result for ``req.domain``."""
+    settings = {
+        "k": req.k,
+        "max_examples": req.max_examples,
+        "split": req.split,
+    }
+    experiment = RagBenchExperiment(**settings)
+    return experiment.run_domain(req.domain)
+@app.post("/qa_example")
+def qa_example(req: QAExampleRequest):
+    """
+    Run the retrieve -> generate -> judge chain on one dataset row.
+
+    The response contains the question, the generated answer, the retrieved
+    documents (with their sentence keys), the raw judge attributes, the TRACe
+    metrics derived from them, and the ground-truth scores stored in the row.
+    An out-of-range ``index`` yields ``{"error": ...}`` instead of a result.
+    """
+    dataset = load_dataset(RAGBENCH_DATASET, req.subset, split=req.split)
+    size = len(dataset)
+    if not 0 <= req.index < size:
+        return {"error": f"index {req.index} out of range (0..{size-1})"}
+    example = dataset[req.index]
+    question = example["question"]
+    # Normalise every document to a list of (sentence_key, sentence_text) tuples.
+    all_docs: List[List[Tuple[str, str]]] = [
+        [(key, text) for key, text in doc]
+        for doc in example["documents_sentences"]
+    ]
+    ranked = ExampleRetriever().rank_docs(question, all_docs, k=req.k)
+    selected_docs = [all_docs[pos] for pos in ranked]
+    answer = RAGGenerator().generate(question, selected_docs)
+    attrs = RAGJudge().annotate(question, answer, selected_docs)
+    pred_metrics = trace_from_attributes(attrs, selected_docs)
+    # Pair each selected document with its index in the row's document list.
+    docs_view = [
+        {
+            "doc_index": doc_index,
+            "sentences": [{"key": key, "text": text} for key, text in doc],
+        }
+        for doc_index, doc in zip(ranked, selected_docs)
+    ]
+    score_fields = (
+        "relevance_score",
+        "utilization_score",
+        "completeness_score",
+        "adherence_score",
+    )
+    return {
+        "subset": req.subset,
+        "index": req.index,
+        "question": question,
+        "answer": answer,
+        "retrieved_docs": docs_view,
+        "judge_attributes": attrs,
+        "predicted_trace_metrics": pred_metrics,
+        "ground_truth": {name: example.get(name) for name in score_fields},
+    }
+@app.get("/health")
+def health():
+    """Liveness check: always reports status "ok"."""
+    payload = dict(status="ok")
+    return payload
+@app.get("/")
+def root():
+    """Return a short banner plus the list of available endpoints."""
+    endpoints = {
+        "health": "/health",
+        "docs": "/docs",
+        "ui": "/ui",
+        "run_domain": "/run_domain (POST)",
+        "qa_example": "/qa_example (POST)",
+    }
+    return {
+        "message": "RAGBench RAG Evaluation API is running.",
+        "endpoints": endpoints,
+    }
+@app.get("/ui", response_class=HTMLResponse)
+def ui():
+    """Serve a static single-page HTML front end with two cards.
+
+    The page is one inline string (CSS, forms and script). ``runDomain()``
+    POSTs JSON to ``/run_domain`` and ``runExample()`` POSTs JSON to
+    ``/qa_example``; each prints the JSON reply into its own ``<pre>`` block.
+    ``runExample()`` also surfaces an ``error`` field returned with HTTP 200.
+    """
+    # The whole page is embedded so no static files or templates are needed.
+    html = """
+    <!DOCTYPE html>
+    <html lang="en">
+    <head>
+      <meta charset="UTF-8" />
+      <title>RAGBench RAG Evaluation UI</title>
+      <meta name="viewport" content="width=device-width, initial-scale=1" />
+      <style>
+        body {
+          font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
+          margin: 0;
+          padding: 0;
+          background: #f3f4f6;
+          color: #111827;
+        }
+        .wrapper {
+          max-width: 1080px;
+          margin: 2rem auto;
+          padding: 1.5rem;
+        }
+        .card {
+          background: #ffffff;
+          border-radius: 0.75rem;
+          box-shadow: 0 10px 25px rgba(0, 0, 0, 0.06);
+          padding: 1.25rem 1.5rem;
+          margin-bottom: 1.5rem;
+        }
+        h1 {
+          margin-top: 0;
+          font-size: 1.6rem;
+        }
+        h2 {
+          margin-top: 0;
+          font-size: 1.2rem;
+        }
+        p {
+          font-size: 0.9rem;
+          color: #4b5563;
+        }
+        .row {
+          display: flex;
+          flex-wrap: wrap;
+          gap: 1rem;
+          margin-bottom: 1rem;
+        }
+        .field {
+          flex: 1 1 180px;
+          min-width: 160px;
+        }
+        label {
+          display: block;
+          font-size: 0.85rem;
+          font-weight: 600;
+          margin-bottom: 0.25rem;
+        }
+        select, input {
+          width: 100%;
+          padding: 0.45rem 0.55rem;
+          border-radius: 0.375rem;
+          border: 1px solid #d1d5db;
+          font-size: 0.9rem;
+          box-sizing: border-box;
+        }
+        button {
+          padding: 0.55rem 1.2rem;
+          border-radius: 999px;
+          border: none;
+          background: #2563eb;
+          color: #ffffff;
+          font-weight: 600;
+          font-size: 0.95rem;
+          cursor: pointer;
+        }
+        button:disabled {
+          opacity: 0.6;
+          cursor: default;
+        }
+        .actions {
+          margin-top: 0.5rem;
+          margin-bottom: 0.75rem;
+        }
+        .status {
+          font-size: 0.85rem;
+          margin-bottom: 0.5rem;
+          color: #4b5563;
+        }
+        pre {
+          background: #0b1020;
+          color: #e5e7eb;
+          padding: 1rem;
+          border-radius: 0.75rem;
+          overflow: auto;
+          max-height: 420px;
+          font-size: 0.8rem;
+        }
+        code {
+          font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
+        }
+        @media (max-width: 640px) {
+          .wrapper {
+            margin: 0.5rem;
+            padding: 0.75rem;
+          }
+          .card {
+            padding: 0.9rem 1rem;
+          }
+        }
+      </style>
+    </head>
+    <body>
+      <div class="wrapper">
+        <div class="card">
+          <h1>RAGBench RAG Evaluation</h1>
+          <p>
+            This UI lets you:
+            (1) run domain-level evaluation on RAGBench, and
+            (2) inspect a single example (question, retrieved docs, answer, and metrics).
+          </p>
+        </div>
+        <!-- Domain evaluation card -->
+        <div class="card">
+          <h2>1. Domain Evaluation (POST /run_domain)</h2>
+          <p>
+            Evaluate all subsets in a domain using the configured LLM and retriever.
+          </p>
+          <div class="row">
+            <div class="field">
+              <label for="domain">Domain</label>
+              <select id="domain">
+                <option value="biomedical">Biomedical</option>
+                <option value="general_knowledge">General Knowledge</option>
+                <option value="legal">Legal</option>
+                <option value="customer_support">Customer Support</option>
+                <option value="finance">Finance</option>
+              </select>
+            </div>
+            <div class="field">
+              <label for="k">Top-k documents</label>
+              <input id="k" type="number" value="3" min="1" />
+            </div>
+            <div class="field">
+              <label for="max_examples">Max examples</label>
+              <input id="max_examples" type="number" value="5" min="1" />
+            </div>
+            <div class="field">
+              <label for="split">Dataset split</label>
+              <input id="split" type="text" value="test" />
+            </div>
+          </div>
+          <div class="actions">
+            <button id="runBtn" onclick="runDomain()">Run Domain Evaluation</button>
+          </div>
+          <div class="status" id="status"></div>
+          <pre><code id="output">{}</code></pre>
+        </div>
+        <!-- Single example viewer card -->
+        <div class="card">
+          <h2>2. Single Example Viewer (POST /qa_example)</h2>
+          <p>
+            Inspect one RAGBench example: question, retrieved documents, answer,
+            judge attributes, and TRACe metrics.
+          </p>
+          <div class="row">
+            <div class="field">
+              <label for="subset">Subset</label>
+              <input list="subset-list" id="subset" value="covidqa" />
+              <datalist id="subset-list">
+                <option value="pubmedqa">
+                <option value="covidqa">
+                <option value="hotpotqa">
+                <option value="msmarco">
+                <option value="hagrid">
+                <option value="expertqa">
+                <option value="cuad">
+                <option value="delucionqa">
+                <option value="emanual">
+                <option value="techqa">
+                <option value="finqa">
+                <option value="tatqa">
+              </datalist>
+            </div>
+            <div class="field">
+              <label for="example_index">Example index</label>
+              <input id="example_index" type="number" value="0" min="0" />
+            </div>
+            <div class="field">
+              <label for="k_example">Top-k documents</label>
+              <input id="k_example" type="number" value="3" min="1" />
+            </div>
+            <div class="field">
+              <label for="split_example">Dataset split</label>
+              <input id="split_example" type="text" value="test" />
+            </div>
+          </div>
+          <div class="actions">
+            <button id="qaBtn" onclick="runExample()">Run Single Example</button>
+          </div>
+          <div class="status" id="qa_status"></div>
+          <pre><code id="qa_output">{}</code></pre>
+        </div>
+      </div>
+      <script>
+        async function runDomain() {
+          const domainEl = document.getElementById("domain");
+          const kEl = document.getElementById("k");
+          const maxExamplesEl = document.getElementById("max_examples");
+          const splitEl = document.getElementById("split");
+          const statusEl = document.getElementById("status");
+          const outputEl = document.getElementById("output");
+          const btn = document.getElementById("runBtn");
+          const payload = {
+            domain: domainEl.value,
+            k: parseInt(kEl.value || "3", 10),
+            max_examples: parseInt(maxExamplesEl.value || "5", 10),
+            split: splitEl.value || "test"
+          };
+          statusEl.textContent = "Running domain evaluation...";
+          btn.disabled = true;
+          outputEl.textContent = "{}";
+          try {
+            const res = await fetch("/run_domain", {
+              method: "POST",
+              headers: { "Content-Type": "application/json" },
+              body: JSON.stringify(payload)
+            });
+            const data = await res.json();
+            if (!res.ok) {
+              statusEl.textContent = "Error " + res.status;
+            } else {
+              statusEl.textContent = "Done.";
+            }
+            outputEl.textContent = JSON.stringify(data, null, 2);
+          } catch (err) {
+            statusEl.textContent = "Request failed: " + err;
+            outputEl.textContent = "{}";
+          } finally {
+            btn.disabled = false;
+          }
+        }
+        async function runExample() {
+          const subsetEl = document.getElementById("subset");
+          const indexEl = document.getElementById("example_index");
+          const kEl = document.getElementById("k_example");
+          const splitEl = document.getElementById("split_example");
+          const statusEl = document.getElementById("qa_status");
+          const outputEl = document.getElementById("qa_output");
+          const btn = document.getElementById("qaBtn");
+          const payload = {
+            subset: subsetEl.value,
+            index: parseInt(indexEl.value || "0", 10),
+            k: parseInt(kEl.value || "3", 10),
+            split: splitEl.value || "test"
+          };
+          statusEl.textContent = "Running single example...";
+          btn.disabled = true;
+          outputEl.textContent = "{}";
+          try {
+            const res = await fetch("/qa_example", {
+              method: "POST",
+              headers: { "Content-Type": "application/json" },
+              body: JSON.stringify(payload)
+            });
+            const data = await res.json();
+            if (!res.ok) {
+              statusEl.textContent = "Error " + res.status;
+            } else if (data.error) {
+              statusEl.textContent = "Backend error: " + data.error;
+            } else {
+              statusEl.textContent = "Done.";
+            }
+            outputEl.textContent = JSON.stringify(data, null, 2);
+          } catch (err) {
+            statusEl.textContent = "Request failed: " + err;
+            outputEl.textContent = "{}";
+          } finally {
+            btn.disabled = false;
+          }
+        }
+      </script>
+    </body>
+    </html>
+    """
+    return HTMLResponse(content=html)

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,12 @@

+version: "3.9"
+services:
+  ragbench-api:
+    build: .
+    ports:
+      # The image's CMD starts uvicorn on container port 7860, so publish that
+      # port on host port 8000 (HOST:CONTAINER). Mapping 8000:8000 would expose
+      # a port nothing listens on.
+      - "8000:7860"
+    environment:
+      HF_TOKEN: "${HF_TOKEN}"
+      GROQ_API_KEY: "${GROQ_API_KEY}"
+      RAGBENCH_LLM_PROVIDER: "${RAGBENCH_LLM_PROVIDER:-groq}"
+      RAGBENCH_GEN_MODEL: "${RAGBENCH_GEN_MODEL:-llama3-8b-8192}"
+      RAGBENCH_JUDGE_MODEL: "${RAGBENCH_JUDGE_MODEL:-llama3-70b-8192}"

prompts/ragbench_judge_prompt.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+IMPORTANT: Replace this file content with the official JSON-format judge prompt
+from the RAGBench paper (Appendix 9.4). Keep the placeholders:
+{documents}
+{question}
+{answer}
+exactly as they are used in their template.

ragbench_eval/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ __all__ = []

ragbench_eval/config.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import os
+from dotenv import load_dotenv
+load_dotenv()
+# API credentials read from the environment; os.getenv returns None when unset.
+HF_TOKEN = os.getenv("HF_TOKEN")
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+# Which LLM backend to use; defaults to "groq".
+LLM_PROVIDER = os.getenv("RAGBENCH_LLM_PROVIDER", "groq")  # "groq" or "hf"
+# Model identifiers for answer generation and for judging, each overridable via env.
+GEN_MODEL = os.getenv("RAGBENCH_GEN_MODEL", "llama3-8b-8192")
+JUDGE_MODEL = os.getenv("RAGBENCH_JUDGE_MODEL", "llama3-70b-8192")
+# Embedding model name; presumably consumed by the retriever module - verify there.
+EMBEDDING_MODEL = os.getenv(
+    "RAGBENCH_EMBEDDING_MODEL",
+    "sentence-transformers/all-MiniLM-L6-v2",
+)
+# Dataset identifier given to datasets.load_dataset() by the API.
+RAGBENCH_DATASET = os.getenv("RAGBENCH_DATASET", "galileo-ai/ragbench")
+# Domain name -> names of the dataset subsets grouped under that domain.
+DOMAIN_TO_SUBSETS = {
+    "biomedical": ["pubmedqa", "covidqa"],
+    "general_knowledge": ["hotpotqa", "msmarco", "hagrid", "expertqa"],
+    "legal": ["cuad"],
+    "customer_support": ["delucionqa", "emanual", "techqa"],
+    "finance": ["finqa", "tatqa"],
+}

ragbench_eval/generator.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from typing import List, Tuple
+from .llm import LLMClient
+from .config import GEN_MODEL
+def build_context_from_docs(
+    docs_sentences: List[List[Tuple[str, str]]]
+) -> str:
+    """Flatten documents into one context string.
+
+    Sentences of a document are joined with single spaces (sentence keys are
+    dropped); documents are separated by a blank line.
+    """
+    return "\n\n".join(
+        " ".join(sentence for _, sentence in document)
+        for document in docs_sentences
+    )
+class RAGGenerator:
+    """Generate an answer to a question from retrieved documents via GEN_MODEL."""
+    def __init__(self):
+        self.client = LLMClient(GEN_MODEL)
+    def generate(self, question: str, docs_sentences: List[List[Tuple[str, str]]]) -> str:  # noqa: E501
+        """Build the context + question prompt and return the model's reply."""
+        user_prompt = "".join([
+            "Use the following pieces of context to answer the question.\n\n",
+            build_context_from_docs(docs_sentences),
+            "\n\n",
+            f"Question: {question}\n\n",
+            "Answer:",
+        ])
+        system_turn = {"role": "system", "content": "You are a precise, grounded QA assistant."}  # noqa: E501
+        user_turn = {"role": "user", "content": user_prompt}
+        return self.client.chat([system_turn, user_turn])

ragbench_eval/judge.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import json
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+from .llm import LLMClient
+from .config import JUDGE_MODEL
+def format_docs_with_keys(
+    documents_sentences: List[List[Tuple[str, str]]]
+) -> str:
+    """Render documents as ``key: sentence`` lines, one blank line between documents.
+
+    Leading and trailing whitespace of the final text is stripped.
+    """
+    lines: List[str] = []
+    for document in documents_sentences:
+        lines.extend(f"{key}: {sent}" for key, sent in document)
+        lines.append("")  # separator after each document
+    rendered = "\n".join(lines)
+    return rendered.strip()
+class RAGJudge:
+    """LLM judge that annotates a (question, answer, documents) triple.
+
+    The prompt template is read once from ``prompt_path`` and must contain the
+    placeholders ``{documents}``, ``{question}`` and ``{answer}``.
+    """
+    # Keys every judge reply must contain.
+    _REQUIRED_KEYS = (
+        "relevance_explanation",
+        "all_relevant_sentence_keys",
+        "overall_supported_explanation",
+        "overall_supported",
+        "sentence_support_information",
+        "all_utilized_sentence_keys",
+    )
+    def __init__(self, prompt_path: str = "prompts/ragbench_judge_prompt.txt"):
+        self.client = LLMClient(JUDGE_MODEL)
+        self.prompt_template = Path(prompt_path).read_text(encoding="utf-8")
+    def _render_prompt(self, documents: str, question: str, answer: str) -> str:
+        """Fill the three placeholders of the prompt template.
+
+        ``str.format`` is tried first so templates that escape braces as
+        ``{{``/``}}`` keep working. A JSON-style template with literal braces
+        makes ``format`` raise; in that case only the three known placeholders
+        are substituted, in a single pass, and all other braces are untouched.
+        """
+        import re  # local import keeps this block self-contained
+        values = {"documents": documents, "question": question, "answer": answer}
+        try:
+            return self.prompt_template.format(**values)
+        except (KeyError, IndexError, ValueError):
+            return re.sub(
+                r"\{(documents|question|answer)\}",
+                lambda match: values[match.group(1)],
+                self.prompt_template,
+            )
+    @staticmethod
+    def _parse_judge_json(raw: str) -> Dict[str, Any]:
+        """Parse the judge reply into a dict.
+
+        If the full reply is not valid JSON (e.g. wrapped in a Markdown code
+        fence), retry on the span from the first ``{`` to the last ``}``.
+
+        Raises:
+            ValueError: if no JSON object can be parsed from the reply.
+        """
+        text = (raw or "").strip()
+        try:
+            data = json.loads(text)
+        except json.JSONDecodeError as err:
+            message = f"Judge JSON parse error: {err}\nRaw: {text[:500]}"
+            start, end = text.find("{"), text.rfind("}")
+            if start == -1 or end <= start:
+                raise ValueError(message) from err
+            try:
+                data = json.loads(text[start:end + 1])
+            except json.JSONDecodeError:
+                raise ValueError(message) from err
+        if not isinstance(data, dict):
+            raise ValueError(
+                f"Judge output is not a JSON object: {type(data).__name__}"
+            )
+        return data
+    def annotate(
+        self,
+        question: str,
+        answer: str,
+        docs_sentences: List[List[Tuple[str, str]]],
+    ) -> Dict[str, Any]:
+        """Ask the judge model to annotate one example and return its parsed JSON.
+
+        Raises:
+            ValueError: if the reply is not a JSON object or lacks a required key.
+        """
+        docs_block = format_docs_with_keys(docs_sentences)
+        prompt = self._render_prompt(docs_block, question, answer)
+        messages = [
+            {
+                "role": "system",
+                "content": "You are an evaluator that outputs STRICT JSON only.",
+            },
+            {"role": "user", "content": prompt},
+        ]
+        raw = self.client.chat(messages, max_tokens=2048)
+        data = self._parse_judge_json(raw)
+        for key in self._REQUIRED_KEYS:
+            if key not in data:
+                raise ValueError(f"Missing key in judge output: {key}")
+        return data

ragbench_eval/llm.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from typing import List, Dict
+from .config import LLM_PROVIDER, HF_TOKEN, GROQ_API_KEY
+from huggingface_hub import InferenceClient
+from groq import Groq
+class LLMClient:
+    """Thin chat wrapper over either the Hugging Face Inference API or Groq.
+
+    The backend is chosen by the module-level ``LLM_PROVIDER`` setting
+    (``"hf"`` or ``"groq"``).
+    """
+    def __init__(self, model: str, is_chat: bool = True):
+        """Create the backend client for ``model``.
+
+        Raises:
+            RuntimeError: if the credential for the selected provider is missing.
+            ValueError: if the provider is neither ``"hf"`` nor ``"groq"``.
+        """
+        self.provider = LLM_PROVIDER
+        self.model = model
+        self.is_chat = is_chat
+        if self.provider == "hf":
+            if not HF_TOKEN:
+                raise RuntimeError("HF_TOKEN is required for HF provider")
+            self.client = InferenceClient(token=HF_TOKEN)
+        elif self.provider == "groq":
+            if not GROQ_API_KEY:
+                raise RuntimeError("GROQ_API_KEY is required for Groq provider")
+            self.client = Groq(api_key=GROQ_API_KEY)
+        else:
+            raise ValueError(f"Unsupported provider {self.provider}")
+    def chat(self, messages: List[Dict[str, str]], max_tokens: int = 1024) -> str:
+        """Send ``messages`` to the model and return the reply text.
+
+        For the HF provider the messages are flattened into one prompt of
+        ``[ROLE]\\ncontent\\n`` segments and sent to ``text_generation``. For
+        Groq they are passed to the chat-completions API unchanged. A reply
+        with no text content is returned as ``""`` so callers always get a str.
+        """
+        if self.provider == "hf":
+            prompt = "".join(
+                f"[{m.get('role', 'user').upper()}]\n{m.get('content', '')}\n"
+                for m in messages
+            )
+            # NOTE(review): temperature is sent together with do_sample=False;
+            # confirm which of the two is intended.
+            out = self.client.text_generation(
+                prompt,
+                model=self.model,
+                max_new_tokens=max_tokens,
+                temperature=0.2,
+                do_sample=False,
+            )
+            return out
+        resp = self.client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=0.2,
+        )
+        content = resp.choices[0].message.content
+        # The content field may be None; normalise so the declared return type holds.
+        return content if content is not None else ""

ragbench_eval/metrics.py ADDED Viewed

	@@ -0,0 +1,77 @@

+from typing import Any, Dict, List, Tuple
+import numpy as np
+from sklearn.metrics import mean_squared_error, roc_auc_score
+def _all_sentence_keys(
+    docs_sentences: List[List[Tuple[str, str]]]
+) -> List[str]:
+    """Return every sentence key of every document, in document order."""
+    return [key for document in docs_sentences for key, _ in document]
def trace_from_attributes(
    attrs: Dict[str, Any],
    docs_sentences: List[List[Tuple[str, str]]],
) -> Dict[str, float]:
    """Turn judge annotations into relevance/utilization/completeness/adherence.

    Args:
        attrs: Judge output. Reads ``"all_relevant_sentence_keys"`` and
            ``"all_utilized_sentence_keys"`` (iterables of sentence keys) and
            ``"overall_supported"`` (truthy when the answer is supported).
            A missing key list or an explicit ``None`` counts as empty.
        docs_sentences: Documents shown to the judge, each a list of
            ``(key, sentence)`` pairs.

    Returns:
        A dict with float scores:
        ``relevance`` and ``utilization`` are the share of all sentences that
        were marked relevant / utilized; ``completeness`` is the share of
        relevant sentences that were also utilized (0.0 when none are
        relevant); ``adherence`` is 1.0 or 0.0. All four are 0.0 when there
        are no sentences at all.
    """
    # Denominator counts every sentence entry; the set is used only to drop
    # keys the judge reported that do not exist in the supplied documents.
    total = sum(len(doc) for doc in docs_sentences)
    if total == 0:
        return {
            "relevance": 0.0,
            "utilization": 0.0,
            "completeness": 0.0,
            "adherence": 0.0,
        }
    known_keys = {key for doc in docs_sentences for key, _ in doc}

    # `or ()` maps a missing entry, an explicit None (JSON null) and an empty
    # list to "no keys" instead of letting set(None) raise TypeError.
    relevant = set(attrs.get("all_relevant_sentence_keys") or ()) & known_keys
    utilized = set(attrs.get("all_utilized_sentence_keys") or ()) & known_keys

    completeness = len(relevant & utilized) / len(relevant) if relevant else 0.0
    adherence = 1.0 if attrs.get("overall_supported", False) else 0.0
    return {
        "relevance": float(len(relevant) / total),
        "utilization": float(len(utilized) / total),
        "completeness": float(completeness),
        "adherence": float(adherence),
    }
def _rmse(y_true: List[float], y_pred: List[float]) -> float:
    """Return the root-mean-squared error of two equal-length sequences.

    Computed with NumPy rather than ``mean_squared_error(..., squared=False)``
    because that keyword is deprecated in scikit-learn 1.4 and removed in 1.6.
    Returns NaN for empty input instead of raising.
    """
    truth = np.asarray(y_true, dtype=float)
    pred = np.asarray(y_pred, dtype=float)
    if truth.shape != pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same length: "
            f"{truth.shape} vs {pred.shape}"
        )
    if truth.size == 0:
        return float("nan")
    return float(np.sqrt(np.mean((truth - pred) ** 2)))


def compute_rmse_auc(
    y_true_rel: List[float],
    y_pred_rel: List[float],
    y_true_util: List[float],
    y_pred_util: List[float],
    y_true_comp: List[float],
    y_pred_comp: List[float],
    y_true_adh: List[int],
    y_pred_adh: List[float],
) -> Dict[str, float]:
    """Score predicted TRACe metrics against ground truth.

    Args:
        y_true_rel, y_pred_rel: Ground-truth / predicted relevance scores.
        y_true_util, y_pred_util: Ground-truth / predicted utilization scores.
        y_true_comp, y_pred_comp: Ground-truth / predicted completeness scores.
        y_true_adh: Ground-truth adherence labels.
        y_pred_adh: Predicted adherence scores.

    Returns:
        A dict with ``rmse_relevance``, ``rmse_utilization``,
        ``rmse_completeness`` and ``auroc_adherence``. An RMSE is NaN when its
        inputs are empty; the AUROC is NaN when ``y_true_adh`` holds fewer
        than two distinct labels, where ROC AUC is undefined.

    Raises:
        ValueError: If a truth/prediction pair differs in length.
    """
    metrics = {
        "rmse_relevance": _rmse(y_true_rel, y_pred_rel),
        "rmse_utilization": _rmse(y_true_util, y_pred_util),
        "rmse_completeness": _rmse(y_true_comp, y_pred_comp),
    }
    if len(set(y_true_adh)) > 1:
        metrics["auroc_adherence"] = float(
            roc_auc_score(y_true_adh, y_pred_adh)
        )
    else:
        metrics["auroc_adherence"] = float("nan")
    return metrics

ragbench_eval/pipeline.py ADDED Viewed

	@@ -0,0 +1,104 @@

+from typing import Dict, Any, List, Tuple, Optional
+from datasets import load_dataset
+from .config import RAGBENCH_DATASET, DOMAIN_TO_SUBSETS
+from .retriever import ExampleRetriever
+from .generator import RAGGenerator
+from .judge import RAGJudge
+from .metrics import trace_from_attributes, compute_rmse_auc
class RagBenchExperiment:
    """Retrieve, generate and judge over RAGBench rows, then score the judge.

    For each row the top-``k`` documents are selected, an answer is generated
    from them, the judge annotates that answer, and the resulting scores are
    compared with the row's ground-truth scores.
    """

    def __init__(
        self,
        k: int = 3,
        max_examples: Optional[int] = None,
        split: str = "test",
    ):
        """Store the run settings and build retriever, generator and judge.

        Args:
            k: Number of documents kept per question.
            max_examples: Row cap per subset; ``None`` means no cap.
            split: Dataset split passed to ``load_dataset``.
        """
        self.k = k
        self.max_examples = max_examples
        self.split = split
        self.retriever = ExampleRetriever()
        self.generator = RAGGenerator()
        self.judge = RAGJudge()

    def _load_subset(self, subset: str):
        """Load one RAGBench subset for the configured split."""
        return load_dataset(RAGBENCH_DATASET, subset, split=self.split)

    def _to_docs_sentences(self, row) -> List[List[Tuple[str, str]]]:
        """Convert ``row["documents_sentences"]`` into lists of 2-tuples."""
        return [
            [(key, sentence) for key, sentence in doc]
            for doc in row["documents_sentences"]
        ]

    def run_subset(self, subset: str) -> Dict[str, Any]:
        """Evaluate one subset and return its metrics.

        Returns:
            A dict with ``"subset"``, ``"n_examples"`` and every entry
            produced by ``compute_rmse_auc``.
        """
        dataset = self._load_subset(subset)
        truth: Dict[str, list] = {"rel": [], "util": [], "comp": [], "adh": []}
        preds: Dict[str, list] = {"rel": [], "util": [], "comp": [], "adh": []}

        for index, row in enumerate(dataset):
            # Stop once the optional per-subset cap has been reached.
            if self.max_examples is not None and index >= self.max_examples:
                break

            question = row["question"]
            all_docs = self._to_docs_sentences(row)
            ranked = self.retriever.rank_docs(question, all_docs, k=self.k)
            top_docs = [all_docs[j] for j in ranked]

            answer = self.generator.generate(question, top_docs)
            verdict = self.judge.annotate(question, answer, top_docs)
            scores = trace_from_attributes(verdict, top_docs)

            truth["rel"].append(float(row["relevance_score"]))
            truth["util"].append(float(row["utilization_score"]))
            truth["comp"].append(float(row["completeness_score"]))
            truth["adh"].append(int(row["adherence_score"]))
            preds["rel"].append(scores["relevance"])
            preds["util"].append(scores["utilization"])
            preds["comp"].append(scores["completeness"])
            preds["adh"].append(scores["adherence"])

        summary = compute_rmse_auc(
            truth["rel"],
            preds["rel"],
            truth["util"],
            preds["util"],
            truth["comp"],
            preds["comp"],
            truth["adh"],
            preds["adh"],
        )
        return {
            "subset": subset,
            "n_examples": len(truth["rel"]),
            **summary,
        }

    def run_domain(self, domain: str) -> Dict[str, Any]:
        """Run every subset mapped to ``domain`` in ``DOMAIN_TO_SUBSETS``.

        Returns:
            ``{"domain": domain, "subsets": [per-subset result, ...]}``.
        """
        per_subset = [
            self.run_subset(name) for name in DOMAIN_TO_SUBSETS[domain]
        ]
        return {
            "domain": domain,
            "subsets": per_subset,
        }

ragbench_eval/retriever.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from typing import List, Tuple
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+from .config import EMBEDDING_MODEL
class ExampleRetriever:
    """Ranks the per-example documents in RAGBench by similarity to the question."""  # noqa: E501

    def __init__(self):
        """Load the sentence-embedding model named by ``EMBEDDING_MODEL``."""
        self.embedder = SentenceTransformer(EMBEDDING_MODEL)

    def _encode(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` with the progress bar disabled."""
        return self.embedder.encode(texts, show_progress_bar=False)

    def rank_docs(
        self,
        question: str,
        documents_sentences: List[List[Tuple[str, str]]],
        k: int = 4,
    ) -> List[int]:
        """Return indices of the ``k`` documents most similar to ``question``.

        Each document is embedded as the space-joined text of its sentences
        and compared with the question embedding by cosine similarity.

        Args:
            question: Query text.
            documents_sentences: Documents as lists of ``(key, sentence)``.
            k: Maximum number of indices to return.

        Returns:
            Document indices ordered from most to least similar; equally
            similar documents keep their original relative order. Empty when
            there are no documents or ``k`` is not positive.
        """
        # Nothing to rank: avoid embedding an empty batch (cosine_similarity
        # rejects zero samples) and avoid `[:k]` with a negative k, which
        # would silently drop documents from the tail instead.
        if k <= 0 or not documents_sentences:
            return []
        doc_texts = [
            " ".join(sent for _, sent in doc) for doc in documents_sentences
        ]
        q_emb = self._encode([question])
        d_emb = self._encode(doc_texts)
        sims = cosine_similarity(q_emb, d_emb)[0]
        # A stable sort on the negated scores gives descending order with
        # deterministic tie-breaking (lower index first).
        topk_idx = np.argsort(-sims, kind="stable")[:k]
        return topk_idx.tolist()

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+datasets==2.21.0
+sentence-transformers==3.0.1
+scikit-learn==1.5.2
+numpy==1.26.4
+pydantic==2.9.2
+fastapi==0.115.5
+uvicorn[standard]==0.32.0
+python-dotenv==1.0.1
+huggingface_hub[inference]==0.26.2
+groq==0.9.0

scripts/run_experiment.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import argparse
+import json
+from ragbench_eval.pipeline import RagBenchExperiment
def main():
    """Run the RAGBench experiment for one domain and print the result as JSON.

    Command-line options: ``--domain`` (required, one of the five supported
    domains), ``--k`` (default 3), ``--max_examples`` (default 50) and
    ``--split`` (default ``"test"``).
    """
    domains = [
        "biomedical",
        "general_knowledge",
        "legal",
        "customer_support",
        "finance",
    ]
    cli = argparse.ArgumentParser()
    cli.add_argument("--domain", type=str, required=True, choices=domains)
    cli.add_argument("--k", type=int, default=3)
    cli.add_argument("--max_examples", type=int, default=50)
    cli.add_argument("--split", type=str, default="test")
    opts = cli.parse_args()

    experiment = RagBenchExperiment(
        k=opts.k,
        max_examples=opts.max_examples,
        split=opts.split,
    )
    report = experiment.run_domain(opts.domain)
    print(json.dumps(report, indent=2))


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()