RAG_Eval / tests /conftest.py
Rom89823974978's picture
Updated work
cdf4160
import json
import shutil
import tempfile
import zlib
from pathlib import Path
from types import SimpleNamespace
from typing import List

import numpy as np
import pytest
@pytest.fixture(scope="session")
def tmp_doc_store(tmp_path_factory):
    """Write a three-document JSONL corpus to a temp dir and return its path.

    The file is named ``docs.jsonl`` and lives in a directory obtained from
    ``tmp_path_factory.mktemp("docs")``. Each line is one JSON object with an
    integer ``id`` and a ``text`` string.
    """
    records = [
        {"id": 0, "text": "Retrieval Augmented Generation combines retrieval and generation."},
        {"id": 1, "text": "BM25 is a strong lexical baseline in information retrieval."},
        {"id": 2, "text": "FAISS enables efficient similarity search over dense embeddings."},
    ]
    store_path = tmp_path_factory.mktemp("docs") / "docs.jsonl"
    # One JSON object per line, every line newline-terminated.
    payload = "".join(json.dumps(record) + "\n" for record in records)
    store_path.write_text(payload)
    return store_path
class _DummyEmbedder:
    """Fast, deterministic replacement for SentenceTransformer during tests.

    * Encodes each text into a 16-dim float32 vector drawn from a generator
      seeded with a CRC32 checksum of the text, so equal texts always yield
      equal vectors -- within a run and across interpreter processes.
    * Normalises vectors so the retriever workflow (IP metric) is preserved.
    """

    _dim = 16

    def __init__(self, *args, **kwargs):
        # Any positional/keyword arguments are accepted and ignored so the
        # class can be constructed wherever the real model would be.
        self.rs = np.random.RandomState(42)

    def encode(self, texts, **kw):
        """Return one unit-length embedding per input text.

        Args:
            texts: A single string or an iterable of texts.
            **kw: Accepted and ignored.

        Returns:
            A float32 array of shape ``(len(texts), 16)``; an empty input
            yields an array of shape ``(0, 16)``.
        """
        if isinstance(texts, str):
            texts = [texts]
        vecs = []
        for t in texts:
            # The built-in hash() is salted per process for str objects
            # (PYTHONHASHSEED), which made the vectors differ between runs.
            # CRC32 of the UTF-8 bytes is stable and already fits the
            # 32-bit range RandomState.seed() requires.
            seed = zlib.crc32(str(t).encode("utf-8"))
            self.rs.seed(seed)
            v = self.rs.randn(self._dim)
            v = v / np.linalg.norm(v)
            vecs.append(v.astype("float32"))
        if not vecs:
            # np.stack() rejects an empty list; return an empty batch instead.
            return np.empty((0, self._dim), dtype="float32")
        return np.stack(vecs)

    # Short human-readable name for the stub.
    def __str__(self):
        return "DummyEmbedder"
@pytest.fixture(autouse=True)
def patch_sentence_transformers(monkeypatch):
    """Swap the dense retriever's ``SentenceTransformer`` for ``_DummyEmbedder``.

    Runs automatically for every test. The replacement goes through
    ``monkeypatch``, which restores the original attribute afterwards.
    """
    # Patch the name where the retriever module looks it up.
    import evaluation.retrievers.dense as retriever_module

    monkeypatch.setattr(
        retriever_module,
        "SentenceTransformer",
        _DummyEmbedder,
    )
    yield