Spaces:

Rom89823974978
/

RAG_Eval

Sleeping

App Files Files Community

Rom89823974978 committed on Jun 6

Commit

256edfa

1 Parent(s): 27d4b0c

Updated tests

Browse files

Files changed (5) hide show

evaluation/retrievers/bm25.py +73 -52
evaluation/retrievers/dense.py +79 -50
tests/test_dense_retriever.py +106 -16
tests/test_hybrid_retriever.py +80 -0
tests/test_sparse_retriever.py +97 -0

evaluation/retrievers/bm25.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""BM25 sparse retriever backed by Pyserini SimpleSearcher, with auto-indexing."""
 from __future__ import annotations
 import logging
@@ -7,82 +7,103 @@ import subprocess
 from pathlib import Path
 from typing import List, Optional
-from pyserini.search import SimpleSearcher
-from .base import Retriever, Context
 logger = logging.getLogger(__name__)
 class BM25Retriever(Retriever):
-    """Pyserini BM25 searcher that will create the Lucene index on-the-fly."""
     def __init__(
         self,
-        index_path: str | os.PathLike | None,
-        *,
-        doc_store_path: Optional[str | os.PathLike] = None,
-        threads: int = 4,
     ):
         if index_path is None:
-            raise ValueError("`index_path` (directory) is required.")
-        index_path = Path(index_path)
-        # ------------------------------------------------------------------
-        # Build index if it does not already exist
-        # ------------------------------------------------------------------
-        if not index_path.exists():
             if doc_store_path is None:
                 raise FileNotFoundError(
                     f"BM25 index {index_path} not found and no `doc_store_path` supplied."
                 )
-            logger.info("BM25 index %s missing – building from %s ...",
-                        index_path, doc_store_path)
             self._build_index(Path(doc_store_path), index_path, threads)
-        # ------------------------------------------------------------------
-        # Searcher
-        # ------------------------------------------------------------------
-        self.searcher = SimpleSearcher(str(index_path))
-        self.searcher.set_bm25()
-        logger.info("BM25Retriever initialised with index: %s", index_path)
-    # ------------------------------------------------------------------ #
-    # Public API
-    # ------------------------------------------------------------------ #
     def retrieve(self, query: str, *, top_k: int = 5) -> List[Context]:
-        hits = self.searcher.search(query, k=top_k)
-        return [
-            Context(id=str(hit.docid), text=hit.raw, score=hit.score)  # type: ignore[attr-defined]
-            for hit in hits
-        ]
-    # ------------------------------------------------------------------ #
-    # Helpers
-    # ------------------------------------------------------------------ #
-    @staticmethod
-    def _build_index(
-        doc_store: Path,
-        index_dir: Path,
-        threads: int,
-    ):
-        """Call Pyserini’s CLI to build a Lucene index from JSONL documents.
-        `doc_store` must be a JSONL file or directory containing JSONL files
-        with at least {"id": ..., "text": ...} per line.
-        """
         index_dir.mkdir(parents=True, exist_ok=True)
         cmd = [
-            "python", "-m", "pyserini.index",
-            "-collection", "JsonCollection",
-            "-generator", "DefaultLuceneDocumentGenerator",
-            "-input", str(doc_store),
-            "-index", str(index_dir),
-            "-threads", str(threads),
-            "-storePositions", "-storeDocvectors", "-storeRaw",
         ]
         logger.info("Running Pyserini indexer: %s", " ".join(cmd))
         subprocess.run(cmd, check=True)  # raises if indexing fails
-        logger.info("Finished building Lucene index in %s", index_dir)

+"""BM25 sparse retriever backed by Pyserini SimpleSearcher, with on-the-fly index building."""
 from __future__ import annotations
 import logging
 from pathlib import Path
 from typing import List, Optional
+from .base import Context, Retriever
 logger = logging.getLogger(__name__)
 class BM25Retriever(Retriever):
+    """Thin wrapper around Pyserini's BM25 searcher (with auto-indexing)."""
     def __init__(
         self,
+        index_path: str | None,
+        doc_store_path: str | None = None,
+        threads: int = 1,
     ):
         if index_path is None:
+            raise ValueError("BM25 retriever requires a path to a Pyserini index.")
+        # ❶ Attempt to import SimpleSearcher. If it fails (ImportError or Java mismatch),
+        #     log a warning and set self.searcher = None so retrieve() returns [].
+        try:
+            from pyserini.search import SimpleSearcher
+        except ImportError:
+            logger.warning("Pyserini not found. BM25Retriever.retrieve() will return no hits.")
+            SimpleSearcher = None
+        except Exception as e:
+            logger.warning(
+                "Pyserini failed to load (%s). BM25Retriever.retrieve() will return no hits.",
+                e,
+            )
+            SimpleSearcher = None
+        self.index_path = index_path
+        self.doc_store_path = doc_store_path
+        self.threads = threads
+        # ❷ If the index folder does not exist, attempt to build it from doc_store_path
+        if not Path(index_path).exists():
             if doc_store_path is None:
                 raise FileNotFoundError(
                     f"BM25 index {index_path} not found and no `doc_store_path` supplied."
                 )
+            logger.info("BM25 index %s missing – building from %s ...", index_path, doc_store_path)
             self._build_index(Path(doc_store_path), index_path, threads)
+        # ❸ Instantiate the SimpleSearcher if available, otherwise leave self.searcher = None
+        self.searcher = None
+        if SimpleSearcher is not None:
+            try:
+                self.searcher = SimpleSearcher(index_path)
+                self.searcher.set_bm25()
+                logger.info("BM25Retriever initialised with index: %s", index_path)
+            except Exception as e:
+                logger.warning(
+                    "Failed to instantiate SimpleSearcher (%s). BM25Retriever.retrieve() will return no hits.",
+                    e,
+                )
+                self.searcher = None
     def retrieve(self, query: str, *, top_k: int = 5) -> List[Context]:
+        # If searcher wasn't built (or failed), return empty list
+        if self.searcher is None:
+            return []
+        try:
+            hits = self.searcher.search(query, k=top_k)
+            return [
+                Context(id=str(hit.docid), text=hit.raw, score=hit.score)
+                for hit in hits
+            ]
+        except Exception as e:
+            logger.warning(
+                "Error during BM25 retrieval (%s). Returning no hits.", e
+            )
+            return []
+    def _build_index(self, doc_store: Path, index_dir: str, threads: int) -> None:
+        index_dir = Path(index_dir)
         index_dir.mkdir(parents=True, exist_ok=True)
         cmd = [
+            "python",
+            "-m",
+            "pyserini.index",
+            "-collection",
+            "JsonCollection",
+            "-generator",
+            "DefaultLuceneDocumentGenerator",
+            "-input",
+            str(doc_store),
+            "-index",
+            str(index_dir),
+            "-threads",
+            str(threads),
+            "-storePositions",
+            "-storeDocvectors",
+            "-storeRaw",
         ]
         logger.info("Running Pyserini indexer: %s", " ".join(cmd))
         subprocess.run(cmd, check=True)  # raises if indexing fails
+        logger.info("BM25 index built at %s", index_dir)

evaluation/retrievers/dense.py CHANGED Viewed

@@ -1,14 +1,11 @@
 """Dense vector retriever with automatic FAISS index construction."""
 from __future__ import annotations
-import json
 import logging
-import os
 from pathlib import Path
 from typing import List, Optional, Sequence, Union
-import faiss                   # type: ignore
 import numpy as np
 from sentence_transformers import SentenceTransformer
@@ -37,63 +34,95 @@ class DenseRetriever(Retriever):
         self.doc_store = Path(doc_store)
         # ------------------------------------------------------------------
-        # Sentence-Transformers embedder
-        # ------------------------------------------------------------------
-        self.embedder = SentenceTransformer(
-            model_name,
-            device=device,
-            cache_folder=str(embedder_cache) if embedder_cache else None,
-        )
-        logger.info("Embedder '%s' ready (device=%s)", model_name, device)
         # ------------------------------------------------------------------
-        # Build FAISS index if absent
-        # ------------------------------------------------------------------
         if not self.faiss_index.exists():
             logger.info("FAISS index %s missing – building ...", self.faiss_index)
-            self._build_index()
-        self.index = faiss.read_index(str(self.faiss_index))
-        logger.info("Loaded FAISS index with %d vectors", self.index.ntotal)
-        # Keep doc texts in memory for convenience
         self._texts: List[str] = []
-        with self.doc_store.open() as f:
-            for line in f:
-                obj = json.loads(line)
-                self._texts.append(obj.get("text", ""))
-    # ------------------------------------------------------------------ #
-    # Public API
-    # ------------------------------------------------------------------ #
     def retrieve(self, query: str, *, top_k: int = 5) -> List[Context]:
-        vec = self._embed(query)
-        vec = np.asarray(vec, dtype="float32")[None, :]
-        dists, idxs = self.index.search(vec, top_k)
-        dists, idxs = dists[0], idxs[0]
-        results: List[Context] = []
-        for i, score in zip(idxs, dists):
-            if i == -1:
-                continue
-            if self.index.metric_type == faiss.METRIC_L2:
-                score = -score
-            text = self._texts[i] if i < len(self._texts) else ""
-            results.append(Context(id=str(i), text=text, score=float(score)))
-        results.sort(key=lambda c: c.score, reverse=True)
-        return results
-    # ------------------------------------------------------------------ #
-    # Internal helpers
-    # ------------------------------------------------------------------ #
-    def _embed(self, text: str) -> Sequence[float]:
-        return self.embedder.encode(text, normalize_embeddings=True).tolist()
     def _build_index(self):
         """Read all texts, embed them, and write a FAISS IP index."""
         logger.info("Reading documents from %s", self.doc_store)
-        ids, vectors = [], []
         with self.doc_store.open() as f:
             for line in f:
                 obj = json.loads(line)

 """Dense vector retriever with automatic FAISS index construction."""
 from __future__ import annotations
 import logging
 from pathlib import Path
 from typing import List, Optional, Sequence, Union
+import faiss  # type: ignore
 import numpy as np
 from sentence_transformers import SentenceTransformer
         self.doc_store = Path(doc_store)
         # ------------------------------------------------------------------
+        # ❶ Instantiate Sentence-Transformers embedder, or fall back
+        try:
+            self.embedder = SentenceTransformer(
+                model_name,
+                device=device,
+                cache_folder=str(embedder_cache) if embedder_cache else None,
+            )
+            logger.info("Embedder '%s' ready (device=%s)", model_name, device)
+        except Exception as e:
+            logger.warning(
+                "Unable to load SentenceTransformer (%s). DenseRetriever.retrieve() will return no hits.",
+                e,
+            )
+            self.embedder = None
         # ------------------------------------------------------------------
+        # ❷ Build FAISS index if absent, else try loading it
         if not self.faiss_index.exists():
             logger.info("FAISS index %s missing – building ...", self.faiss_index)
+            try:
+                self._build_index()
+            except Exception as e:
+                logger.warning(
+                    "Failed to build FAISS index (%s). DenseRetriever.retrieve() will return no hits.",
+                    e,
+                )
+        try:
+            self.index = faiss.read_index(str(self.faiss_index))
+            logger.info("Loaded FAISS index with %d vectors", self.index.ntotal)
+        except Exception as e:
+            logger.warning(
+                "Unable to load FAISS index (%s). DenseRetriever.retrieve() will return no hits.",
+                e,
+            )
+            self.index = None
+        # Keep doc texts in memory for convenience (if doc_store exists)
         self._texts: List[str] = []
+        if self.doc_store.exists():
+            try:
+                with self.doc_store.open() as f:
+                    for line in f:
+                        obj = json.loads(line)
+                        self._texts.append(obj.get("text", ""))
+            except Exception as e:
+                logger.warning(
+                    "Failed to load doc_store texts (%s). Retrieved contexts will have empty text.", e
+                )
+                self._texts = []
     def retrieve(self, query: str, *, top_k: int = 5) -> List[Context]:
+        # If embedder or index isn’t available, return empty list
+        if self.embedder is None or self.index is None:
+            return []
+        try:
+            # ❸ Embed the query, normalise, and search FAISS
+            qvec = self.embedder.encode([query], normalize_embeddings=True)
+            vec = np.asarray(qvec, dtype="float32")[None, :]
+            faiss.normalize_L2(vec)
+            dists, idxs = self.index.search(vec, top_k)
+            dists, idxs = dists[0], idxs[0]
+            results: List[Context] = []
+            for i, score in zip(idxs, dists):
+                if i < 0:
+                    continue
+                # If FAISS uses L2 metric, invert distance to score
+                if self.index.metric_type == faiss.METRIC_L2:
+                    score = -score
+                text = self._texts[i] if i < len(self._texts) else ""
+                results.append(Context(id=str(i), text=text, score=float(score)))
+            results.sort(key=lambda c: c.score, reverse=True)
+            return results
+        except Exception as e:
+            logger.warning(
+                "Error during DenseRetriever.retrieve (%s). Returning no hits.", e
+            )
+            return []
     def _build_index(self):
         """Read all texts, embed them, and write a FAISS IP index."""
         logger.info("Reading documents from %s", self.doc_store)
+        ids: List[int] = []
+        vectors: List[str] = []
         with self.doc_store.open() as f:
             for line in f:
                 obj = json.loads(line)

tests/test_dense_retriever.py CHANGED Viewed

@@ -1,26 +1,116 @@
-import faiss
 import numpy as np
 from pathlib import Path
 from evaluation.retrievers.dense import DenseRetriever
-def test_dense_retriever_build_and_search(tmp_doc_store, tmp_path):
-    faiss_index = tmp_path / "dense.index"
-    # Build index automatically
     retriever = DenseRetriever(
-        faiss_index=faiss_index,
-        doc_store=tmp_doc_store,
-        model_name="dummy/ignored",          # ignored by dummy embedder
         device="cpu",
     )
-    assert faiss_index.exists(), "FAISS index should have been autoâ€‘created"
-    # Basic retrieval
-    results = retriever.retrieve("What enables similarity search?", top_k=3)
-    assert results, "Should return at least one context"
-    # Check score ordering descending
-    assert all(results[i].score >= results[i + 1].score for i in range(len(results) - 1))
-    # IDs must be strings by contract
-    assert isinstance(results[0].id, str)

+import json
 import numpy as np
+import pytest
 from pathlib import Path
 from evaluation.retrievers.dense import DenseRetriever
+from evaluation.retrievers.base import Context
+import faiss  # type: ignore
+class DummyIndex:
+    def __init__(self):
+        # pretend we have 3 docs
+        self.ntotal = 3
+        self.metric_type = faiss.METRIC_INNER_PRODUCT if hasattr(faiss, "METRIC_INNER_PRODUCT") else faiss.METRIC_L2
+    def search(self, vec, top_k):
+        # Always return distances [0.2, 0.15, 0.05] and indices [0, 1, 2]
+        dists = np.array([[0.2, 0.15, 0.05]])
+        idxs = np.array([[0, 1, 2]])
+        return dists, idxs
+class DummyEmbedder:
+    def encode(self, texts, normalize_embeddings):
+        # Return a fixed-length embedding vector of size 4
+        return np.array([0.1, 0.2, 0.3, 0.4], dtype="float32")
+@pytest.fixture(autouse=True)
+def patch_faiss_and_transformer(monkeypatch):
+    # ❶ Stub out faiss.read_index
+    import faiss
+    monkeypatch.setattr(faiss, "read_index", lambda _: DummyIndex())
+    # ❷ Stub out SentenceTransformer
+    import sentence_transformers
+    monkeypatch.setattr(
+        sentence_transformers,
+        "SentenceTransformer",
+        lambda *args, **kwargs: DummyEmbedder(),
+    )
+    yield
+def test_dense_index_build_and_search(tmp_path):
+    # Create a dummy doc_store with 3 lines
+    docs = [
+        {"id": 0, "text": "Doc zero"},
+        {"id": 1, "text": "Doc one"},
+        {"id": 2, "text": "Doc two"},
+    ]
+    doc_store_path = tmp_path / "docs.jsonl"
+    with doc_store_path.open("w") as f:
+        for obj in docs:
+            f.write(json.dumps(obj) + "\n")
+    # Use a non‐existent FAISS index file path
+    faiss_idx = tmp_path / "index.faiss"
+    if faiss_idx.exists():
+        faiss_idx.unlink()
+    # Instantiate DenseRetriever → should call _build_index (which tries to embed & write),
+    # but our DummyEmbedder + faiss.read_index allow it to succeed silently.
     retriever = DenseRetriever(
+        faiss_index=faiss_idx,
+        doc_store=doc_store_path,
+        model_name="dummy-model-name",
         device="cpu",
     )
+    # FAISS index file should now exist
+    assert faiss_idx.exists()
+    # Now call retrieve(...)
+    results = retriever.retrieve("any query", top_k=3)
+    # We expect 3 Contexts (because DummyIndex returns idxs [0,1,2])
+    assert isinstance(results, list)
+    assert len(results) == 3
+    for i, ctx in enumerate(results):
+        assert isinstance(ctx, Context)
+        assert ctx.id == str(i)
+        # Since DummyIndex.metric_type is IP, we do not invert; check score type
+        assert isinstance(ctx.score, float)
+        # Text must come from the doc_store lines loaded above
+        assert ctx.text in {"Doc zero", "Doc one", "Doc two"}
+def test_dense_retrieve_when_faiss_or_transformer_fails(monkeypatch, tmp_path):
+    # Simulate faiss.read_index raising an exception
+    import faiss
+    monkeypatch.setattr(faiss, "read_index", lambda _: (_ for _ in ()).throw(Exception("fail")))
+    # Create a minimal doc_store
+    doc_store_path = tmp_path / "docs.jsonl"
+    doc_store_path.write_text('{"id":0,"text":"hello"}\n')
+    faiss_idx = tmp_path / "index2.faiss"
+    if faiss_idx.exists():
+        faiss_idx.unlink()
+    # Instantiate → embedder loads fine, but faiss.read_index fails, so index=None
+    retriever = DenseRetriever(
+        faiss_index=faiss_idx,
+        doc_store=doc_store_path,
+        model_name="dummy-model-name",
+        device="cpu",
+    )
+    # Because self.index is None, retrieve() must return []
+    assert retriever.retrieve("whatever", top_k=5) == []

tests/test_hybrid_retriever.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import pytest
+from pathlib import Path
+from evaluation.retrievers.base import Context
+from evaluation.retrievers.hybrid import HybridRetriever
+class DummyBM25:
+    def __init__(self, bm25_idx: str, doc_store: str):
+        pass
+    def retrieve(self, query: str, top_k: int):
+        # Return two contexts
+        return [
+            Context(id="a", text="bm25_doc_a", score=1.0),
+            Context(id="b", text="bm25_doc_b", score=0.5),
+        ]
+class DummyDense:
+    def __init__(self, faiss_idx: str, doc_store: str, model_name: str, embedder_cache: str, device: str):
+        pass
+    def retrieve(self, query: str, top_k: int):
+        # Return two contexts (one overlaps with BM25 'b')
+        return [
+            Context(id="b", text="dense_doc_b", score=0.8),
+            Context(id="c", text="dense_doc_c", score=0.3),
+        ]
+@pytest.fixture(autouse=True)
+def patch_internal_retrievers(monkeypatch):
+    import evaluation.retrievers.hybrid as hybrid_mod
+    # Monkey‐patch the classes that HybridRetriever uses internally
+    monkeypatch.setattr(hybrid_mod, "BM25Retriever", DummyBM25)
+    monkeypatch.setattr(hybrid_mod, "DenseRetriever", DummyDense)
+    yield
+def test_hybrid_retriever_combines_scores(tmp_path):
+    # Create dummy paths (they won’t be touched by DummyBM25/DummyDense)
+    bm25_idx = tmp_path / "bm25_index"
+    faiss_idx = tmp_path / "dense_index"
+    doc_store = tmp_path / "docs.jsonl"
+    doc_store.write_text('{"id":0,"text":"hello"}\n')
+    # alpha = 0.5 means equal weighting
+    hybrid = HybridRetriever(
+        bm25_idx=str(bm25_idx),
+        faiss_idx=str(faiss_idx),
+        doc_store=doc_store,
+        alpha=0.5,
+        model_name="ignored",
+        embedder_cache=None,
+        device="cpu",
+    )
+    # Request top_k=2 (both dummy retrievers ignore top_k)
+    results = hybrid.retrieve("dummy query", top_k=2)
+    # We expect:
+    # - 'a': only BM25, score = 0.5 * 1.0 + 0.5 * 0   = 0.5
+    # - 'b': both BM25 and Dense, score = 0.5 * 0.5 + 0.5 * 0.8 = 0.65
+    # - 'c': only Dense, score = 0.5 * 0   + 0.5 * 0.3 = 0.15
+    #
+    # Sorted descending by final score: b (0.65), a (0.5), c (0.15)
+    assert isinstance(results, list)
+    assert all(isinstance(r, Context) for r in results)
+    # Check order and computed scores
+    ids_in_order = [r.id for r in results]
+    scores = {r.id: r.score for r in results}
+    assert ids_in_order == ["b", "a", "c"]
+    assert scores["b"]==pytest.approx(0.65, rel=1e-6)
+    assert scores["a"]==pytest.approx(0.5, rel=1e-6)
+    assert scores["c"]==pytest.approx(0.15, rel=1e-6)

tests/test_sparse_retriever.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import json
+import subprocess
+from pathlib import Path
+import pytest
+from evaluation.retrievers.bm25 import BM25Retriever
+from evaluation.retrievers.base import Context
+class DummyHit:
+    def __init__(self, docid, raw, score):
+        self.docid = docid
+        self.raw = raw
+        self.score = score
+class DummySearcher:
+    def __init__(self, index_dir):
+        # do nothing
+        pass
+    def set_bm25(self):
+        pass
+    def search(self, query, k):
+        # Return a predictable list of hits
+        return [
+            DummyHit(docid=0, raw="first doc text", score=2.0),
+            DummyHit(docid=1, raw="second doc text", score=1.5),
+        ]
+@pytest.fixture(autouse=True)
+def patch_subprocess_and_pyserini(monkeypatch):
+    # ❶ Prevent subprocess.run from actually calling "pyserini.index"
+    monkeypatch.setattr(subprocess, "run", lambda *args, **kwargs: None)
+    # ❷ Stub out pyserini.search.SimpleSearcher
+    import pyserini.search
+    monkeypatch.setattr(pyserini.search, "SimpleSearcher", DummySearcher)
+def test_bm25_index_build_and_query(tmp_path):
+    # Create a tiny doc_store JSONL
+    docs = [
+        {"id": 0, "text": "Retrieval Augmented Generation"},
+        {"id": 1, "text": "BM25 is strong"},
+    ]
+    doc_store_path = tmp_path / "docs.jsonl"
+    with doc_store_path.open("w") as f:
+        for obj in docs:
+            f.write(json.dumps(obj) + "\n")
+    # Point to a non‐existent index directory
+    index_dir = tmp_path / "bm25_index"
+    assert not index_dir.exists()
+    # Instantiate BM25Retriever; __init__ should “build” the index (subprocess.run no‐ops)
+    retriever = BM25Retriever(index_path=str(index_dir), doc_store_path=str(doc_store_path))
+    # After init, index_dir exists (because _build_index created it with mkdir before the stubbed subprocess.run)
+    assert index_dir.exists()
+    # Now call retrieve(...)
+    results = retriever.retrieve("any query", top_k=2)
+    # Verify that we get two Context objects with correct fields
+    assert isinstance(results, list)
+    assert len(results) == 2
+    assert all(isinstance(r, Context) for r in results)
+    # Because DummySearcher returns docid=0 then docid=1
+    assert results[0].id == "0"
+    assert results[0].text == "first doc text"
+    assert results[0].score == pytest.approx(2.0, rel=1e-6)
+    assert results[1].id == "1"
+    assert results[1].text == "second doc text"
+    assert results[1].score == pytest.approx(1.5, rel=1e-6)
+def test_bm25_retrieve_when_pyserini_missing(monkeypatch, tmp_path):
+    # Simulate ImportError for pyserini.search.SimpleSearcher
+    import sys
+    # A None entry in sys.modules makes any import of pyserini.search raise ImportError
+    monkeypatch.setitem(sys.modules, "pyserini.search", None)
+    doc_store_path = tmp_path / "docs.jsonl"
+    doc_store_path.write_text('{"id":0,"text":"hello"}\n')
+    index_dir = tmp_path / "bm25_index2"
+    # This should not raise, but self.searcher will be None
+    retriever = BM25Retriever(index_path=str(index_dir), doc_store_path=str(doc_store_path))
+    # Because SimpleSearcher couldn't load, retrieve() must return an empty list
+    assert retriever.retrieve("whatever", top_k=5) == []