Spaces:

DataEyond
/

Agentic-Service-Data-Eyond

Sleeping

App Files Files Community

Rifqi Hafizuddin committed on 23 days ago

Commit

ac6b78d

1 Parent(s): e9f2a26

[KM-507] add multiple retrieval method to compare (dense, mmr, bm25, hybrid)

Browse files

Files changed (2) hide show

src/db/postgres/init_db.py +15 -0
src/rag/retrievers/schema.py +274 -43

src/db/postgres/init_db.py CHANGED Viewed

@@ -28,3 +28,18 @@ async def init_db():
         await conn.execute(text(
             "ALTER TABLE rooms ADD COLUMN IF NOT EXISTS status VARCHAR NOT NULL DEFAULT 'active'"
         ))

         await conn.execute(text(
             "ALTER TABLE rooms ADD COLUMN IF NOT EXISTS status VARCHAR NOT NULL DEFAULT 'active'"
         ))
+        # GIN index for FTS on schema chunks — only created if table exists
+        # (langchain_pg_embedding is created by PGVector on first use, not by create_all)
+        await conn.execute(text("""
+            DO $$
+            BEGIN
+                IF EXISTS (
+                    SELECT FROM information_schema.tables
+                    WHERE table_name = 'langchain_pg_embedding'
+                ) THEN
+                    CREATE INDEX IF NOT EXISTS idx_langchain_pg_embedding_fts
+                    ON langchain_pg_embedding USING GIN (to_tsvector('english', document));
+                END IF;
+            END $$
+        """))

src/rag/retrievers/schema.py CHANGED Viewed

@@ -1,86 +1,317 @@
 """Schema retriever — handles DB schemas (source_type="database") and tabular file
 columns stored as source_type="document" with file_type in ("csv","xlsx").
-Strategy: similarity search with score threshold on two metadata shapes,
-run in parallel, merged and re-ranked by score.
 """
 import asyncio
 from src.db.postgres.vector_store import get_vector_store
 from src.middlewares.logging import get_logger
 from src.rag.base import BaseRetriever, RetrievalResult
 logger = get_logger("schema_retriever")
-_SCORE_THRESHOLD = 0.45  # cosine distance — discard above this value
 _TABULAR_FILE_TYPES = ("csv", "xlsx")
 class SchemaRetriever(BaseRetriever):
     def __init__(self):
         self.vector_store = get_vector_store()
-    async def _search_db(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
-        """Retrieve DB schema chunks (source_type="database")."""
         docs_with_scores = await self.vector_store.asimilarity_search_with_score(
             query=query,
-            k=k,
             filter={"user_id": user_id, "source_type": "database"},
         )
         results = []
         for doc, distance in docs_with_scores:
-            if distance <= _SCORE_THRESHOLD:
-                results.append(
-                    RetrievalResult(
-                        content=doc.page_content,
-                        metadata=doc.metadata,
-                        score=1.0 - distance,
-                        source_type="database",
-                    )
                 )
         return results
-    async def _search_tabular(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
-        """Retrieve CSV/XLSX column chunks (source_type="document", file_type=csv|xlsx)."""
         results = []
-        for file_type in _TABULAR_FILE_TYPES:
-            docs_with_scores = await self.vector_store.asimilarity_search_with_score(
-                query=query,
-                k=k,
-                filter={
-                    "user_id": user_id,
-                    "source_type": "document",
-                    "data": {"file_type": file_type},
-                },
             )
-            for doc, distance in docs_with_scores:
-                if distance <= _SCORE_THRESHOLD:
-                    results.append(
-                        RetrievalResult(
-                            content=doc.page_content,
-                            metadata=doc.metadata,
-                            score=1.0 - distance,
-                            source_type="document",
-                        )
-                    )
         return results
-    async def retrieve(
-        self, query: str, user_id: str, k: int = 5
     ) -> list[RetrievalResult]:
         db_results, tabular_results = await asyncio.gather(
             self._search_db(query, user_id, k),
             self._search_tabular(query, user_id, k),
         )
-        combined = db_results + tabular_results
-        combined.sort(key=lambda r: r.score, reverse=True)
-        logger.info(
-            "schema retrieval",
-            db_chunks=len(db_results),
-            tabular_chunks=len(tabular_results),
         )
         return combined[:k]
 schema_retriever = SchemaRetriever()

 """Schema retriever — handles DB schemas (source_type="database") and tabular file
 columns stored as source_type="document" with file_type in ("csv","xlsx").
+Multiple retrieval strategies are exposed for benchmarking. The active strategy
+used by the router is `retrieve()`, which dispatches to ACTIVE_STRATEGY.
+Change ACTIVE_STRATEGY at module level to switch without touching the router.
 """
 import asyncio
+import time
+from typing import Literal
+from sqlalchemy import text
+from src.db.postgres.connection import _pgvector_engine
 from src.db.postgres.vector_store import get_vector_store
 from src.middlewares.logging import get_logger
 from src.rag.base import BaseRetriever, RetrievalResult
 # Module-level logger for the schema retriever.
 logger = get_logger("schema_retriever")
 # Distance cutoff used by the dense searches whenever a threshold is in effect.
+_SCORE_THRESHOLD = 0.60  # cosine distance — discard above this value (score < 0.40)
 # file_type values (read from metadata["data"]["file_type"]) that mark a document chunk as tabular.
 _TABULAR_FILE_TYPES = ("csv", "xlsx")
 # Names of the strategy methods on SchemaRetriever; retrieve() looks the active one up via getattr.
+Strategy = Literal["dense", "dense_no_threshold", "mmr", "hybrid", "hybrid_bm25"]
 # Strategy that retrieve() dispatches to; change it here to switch strategies.
+ACTIVE_STRATEGY: Strategy = "hybrid_bm25"
 class SchemaRetriever(BaseRetriever):
     """Retrieve schema chunks for one user through interchangeable strategies.

     Two chunk shapes are searched: DB schema chunks (source_type="database") and
     tabular file chunks (source_type="document" whose metadata["data"]["file_type"]
     is one of _TABULAR_FILE_TYPES). Every named strategy takes (query, user_id, k)
     and returns at most k results, so retrieve() can dispatch to any of them by name.
     """

     def __init__(self):
         # Vector store handle shared by the dense and MMR searches.
         self.vector_store = get_vector_store()

     # ------------------------------------------------------------------
     # Internal search helpers
     # ------------------------------------------------------------------
     @staticmethod
     def _chunk_data(metadata) -> dict:
         """Return metadata["data"] as a dict; {} when it is missing or not a dict.

         Guards the chained lookups in this class: a chunk stored with
         "data": null would otherwise raise AttributeError on the second .get().
         """
         data = (metadata or {}).get("data")
         return data if isinstance(data, dict) else {}

     @staticmethod
     def _rrf_key(result: RetrievalResult) -> str:
         """Identity key used by RRF: the first 120 characters of the chunk content."""
         return result.content[:120]

     async def _search_db(
         self, query: str, user_id: str, k: int, threshold: float | None = _SCORE_THRESHOLD
     ) -> list[RetrievalResult]:
         """Dense search over DB schema chunks (source_type="database").

         Args:
             query: Text passed to the vector store similarity search.
             user_id: Only chunks whose metadata user_id matches are searched.
             k: Target result count; k * 4 candidates are requested.
             threshold: Maximum distance to keep; None disables the cutoff.

         Returns:
             The surviving candidates in the order the vector store returned
             them, each with score = 1.0 - distance. Not truncated to k here.
         """
         docs_with_scores = await self.vector_store.asimilarity_search_with_score(
             query=query,
             k=k * 4,  # fetch extra to survive dedup attrition from multiple ingestion runs
             filter={"user_id": user_id, "source_type": "database"},
         )
         return [
             RetrievalResult(
                 content=doc.page_content,
                 metadata=doc.metadata,
                 score=1.0 - distance,
                 source_type="database",
             )
             for doc, distance in docs_with_scores
             if threshold is None or distance <= threshold
         ]

     async def _search_tabular(
         self, query: str, user_id: str, k: int, threshold: float | None = _SCORE_THRESHOLD
     ) -> list[RetrievalResult]:
         """Dense search over tabular file chunks (csv/xlsx documents).

         The store is filtered on source_type="document" only; the file_type check
         happens here in Python, so k * 4 candidates are requested to leave room
         for the non-tabular documents that get discarded.

         Args:
             query: Text passed to the vector store similarity search.
             user_id: Only chunks whose metadata user_id matches are searched.
             k: Collection stops once this many results have been accepted.
             threshold: Maximum distance to keep; None disables the cutoff.

         Returns:
             Accepted results in store order, each with score = 1.0 - distance.
         """
         docs_with_scores = await self.vector_store.asimilarity_search_with_score(
             query=query,
             k=k * 4,
             filter={"user_id": user_id, "source_type": "document"},
         )
         results: list[RetrievalResult] = []
         for doc, distance in docs_with_scores:
             if self._chunk_data(doc.metadata).get("file_type") not in _TABULAR_FILE_TYPES:
                 continue
             if threshold is not None and distance > threshold:
                 continue
             results.append(
                 RetrievalResult(
                     content=doc.page_content,
                     metadata=doc.metadata,
                     score=1.0 - distance,
                     source_type="document",
                 )
             )
             if len(results) >= k:
                 break
         return results

     async def _search_db_mmr(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
         """MMR search over DB schema chunks; every result gets score 0.0.

         Requests k * 4 documents selected from a fetch_k = k * 12 candidate pool.
         """
         docs = await self.vector_store.amax_marginal_relevance_search(
             query=query,
             k=k * 4,  # fetch extra to survive dedup attrition
             fetch_k=k * 12,
             filter={"user_id": user_id, "source_type": "database"},
         )
         return [
             RetrievalResult(
                 content=doc.page_content,
                 metadata=doc.metadata,
                 score=0.0,  # MMR does not return scores
                 source_type="database",
             )
             for doc in docs
         ]

     async def _search_tabular_mmr(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
         """MMR search over tabular file chunks; every result gets score 0.0.

         Non-tabular documents are discarded in Python and collection stops once
         k results have been accepted.
         """
         docs = await self.vector_store.amax_marginal_relevance_search(
             query=query,
             k=k * 4,
             fetch_k=k * 12,
             filter={"user_id": user_id, "source_type": "document"},
         )
         results: list[RetrievalResult] = []
         for doc in docs:
             if self._chunk_data(doc.metadata).get("file_type") not in _TABULAR_FILE_TYPES:
                 continue
             results.append(
                 RetrievalResult(
                     content=doc.page_content,
                     metadata=doc.metadata,
                     score=0.0,
                     source_type="document",
                 )
             )
             if len(results) >= k:
                 break
         return results

     async def _search_fts_db(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
         """Full-text search over DB schema chunks using PostgreSQL tsvector.

         Uses plainto_tsquery, so the query is treated as plain text and needs no
         tsquery operator syntax. Rows are ordered by ts_rank and capped at k; the
         rank is returned as score, but RRF merging only looks at list position.

         The WHERE expression to_tsvector('english', document) is written to match
         the expression GIN index that init_db.py creates. Without that index the
         query still returns the same rows, only via a slower scan.
         NOTE(review): the collection name 'document_embeddings' is hard-coded —
         confirm it matches the collection get_vector_store() uses.
         """
         sql = text("""
             SELECT lpe.document, lpe.cmetadata,
                    ts_rank(to_tsvector('english', lpe.document),
                            plainto_tsquery('english', :query)) AS rank
             FROM langchain_pg_embedding lpe
             JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
             WHERE lpc.name = 'document_embeddings'
               AND lpe.cmetadata->>'user_id' = :user_id
               AND lpe.cmetadata->>'source_type' = 'database'
               AND to_tsvector('english', lpe.document) @@ plainto_tsquery('english', :query)
             ORDER BY rank DESC
             LIMIT :k
         """)
         async with _pgvector_engine.connect() as conn:
             result = await conn.execute(sql, {"query": query, "user_id": user_id, "k": k})
             rows = result.fetchall()
         return [
             RetrievalResult(
                 content=row.document,
                 metadata=row.cmetadata,
                 score=float(row.rank),
                 source_type="database",
             )
             for row in rows
         ]

     def _rrf_merge(
         self,
         *ranked_lists: list[RetrievalResult],
         k_rrf: int = 60,
         top_k: int = 5,
     ) -> list[RetrievalResult]:
         """Reciprocal Rank Fusion — combine ranked lists using rank positions only.

         Each appearance of a chunk at 0-based position `rank` adds
         1 / (k_rrf + rank + 1) to that chunk's fused score. Chunks are identified
         by a content prefix, so the same chunk appearing in several lists is
         counted once with an accumulated score.

         Args:
             *ranked_lists: Result lists, each already ordered best-first.
             k_rrf: RRF damping constant.
             top_k: Maximum number of merged results to return.

         Returns:
             Up to top_k results ordered by fused score, highest first. Each
             returned object keeps its own `score` field (the fused score is not
             written back); when a chunk appears more than once, the object from
             the last appearance is the one returned.
         """
         fused: dict[str, float] = {}
         chosen: dict[str, RetrievalResult] = {}
         for ranked in ranked_lists:
             for rank, result in enumerate(ranked):
                 key = self._rrf_key(result)
                 fused[key] = fused.get(key, 0.0) + 1.0 / (k_rrf + rank + 1)
                 chosen[key] = result
         # sorted() is stable, so equal fused scores keep first-seen order.
         ordered_keys = sorted(chosen, key=fused.__getitem__, reverse=True)
         return [chosen[key] for key in ordered_keys[:top_k]]

     def _dedup(self, results: list[RetrievalResult]) -> list[RetrievalResult]:
         """Collapse duplicate chunks, keeping the highest-scoring one per identity.

         Identity is (table_name, column_name or filename) read from metadata["data"].
         Multiple ingestion runs of the same DB produce identical chunks; this keeps
         one chunk per column for the LLM context. Chunks that carry none of those
         fields are keyed by their full content instead, so unrelated chunks do not
         collapse into a single (None, None) bucket and get dropped.

         Returns:
             Survivors sorted by score, highest first. Ties keep first-seen order
             (sorted() is stable), which preserves MMR order when all scores are 0.0.
         """
         best: dict[tuple, RetrievalResult] = {}
         for result in results:
             data = self._chunk_data(result.metadata)
             table = data.get("table_name")
             column = data.get("column_name") or data.get("filename")
             if table is None and column is None:
                 key = ("content", result.content)
             else:
                 key = ("field", table, column)
             current = best.get(key)
             if current is None or result.score > current.score:
                 best[key] = result
         return sorted(best.values(), key=lambda r: r.score, reverse=True)

     # ------------------------------------------------------------------
     # Named strategies — call directly from benchmark / test scripts
     # ------------------------------------------------------------------
     async def dense(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
         """Dense similarity with the _SCORE_THRESHOLD cutoff.

         DB and tabular searches run concurrently; their results are deduplicated
         together and the top k by score are returned.
         """
         db_results, tabular_results = await asyncio.gather(
             self._search_db(query, user_id, k),
             self._search_tabular(query, user_id, k),
         )
         combined = self._dedup(db_results + tabular_results)
         return combined[:k]

     async def dense_no_threshold(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
         """Dense similarity without score cutoff.

         Use to calibrate whether the threshold is too strict/loose —
         compare returned chunks against `dense` to see what gets filtered out.
         """
         db_results, tabular_results = await asyncio.gather(
             self._search_db(query, user_id, k, threshold=None),
             self._search_tabular(query, user_id, k, threshold=None),
         )
         combined = self._dedup(db_results + tabular_results)
         return combined[:k]

     async def mmr(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
         """MMR (Maximal Marginal Relevance) for diversity.

         Note: scores are 0.0 — MMR does not expose similarity scores.
         Dedup still applied since multiple ingestion runs produce duplicate chunks.
         """
         db_results, tabular_results = await asyncio.gather(
             self._search_db_mmr(query, user_id, k),
             self._search_tabular_mmr(query, user_id, k),
         )
         return self._dedup(db_results + tabular_results)[:k]

     async def hybrid(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
         """RRF merge of dense + MMR results.

         Both legs are embedding-based, so this is not a lexical hybrid; it is
         kept as a comparison point for benchmarking. The dense + PostgreSQL
         full-text variant is `hybrid_bm25`.
         """
         dense_results, mmr_results = await asyncio.gather(
             self.dense(query, user_id, k),
             self.mmr(query, user_id, k),
         )
         return self._rrf_merge(dense_results, mmr_results, top_k=k)

     async def hybrid_bm25(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
         """RRF merge of dense + PostgreSQL FTS (true hybrid).

         Dense handles semantic queries ("customer information", "revenue columns").
         FTS handles structural/exact terms that appear literally in chunks:
         [PRIMARY KEY], [FK ->], column type strings, exact column/table names.
         Despite the name, the lexical leg is ranked with PostgreSQL ts_rank.
         FTS covers DB schema chunks only; tabular chunks enter through the dense leg.
         FTS results (k * 4 requested) are deduplicated before the merge so that
         repeated ingestion runs do not occupy several rank positions.
         """
         dense_results, fts_results = await asyncio.gather(
             self.dense(query, user_id, k),
             self._search_fts_db(query, user_id, k * 4),
         )
         return self._rrf_merge(dense_results, self._dedup(fts_results), top_k=k)

     # ------------------------------------------------------------------
     # Public interface — called by the router
     # ------------------------------------------------------------------
     async def retrieve(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
         """Run the strategy named by module-level ACTIVE_STRATEGY and log the result count."""
         strategy_fn = getattr(self, ACTIVE_STRATEGY)
         results = await strategy_fn(query, user_id, k)
         logger.info("schema retrieval", strategy=ACTIVE_STRATEGY, count=len(results))
         return results
+# ------------------------------------------------------------------
+# Benchmark helper — import in test scripts
+# ------------------------------------------------------------------
+async def benchmark(
+    query: str,
+    user_id: str,
+    k: int = 5,
+    strategies: list[Strategy] | None = None,
+) -> dict[str, dict]:
+    """Run multiple strategies against the same query and return timing + results.
+    Strategies run sequentially so timings are isolated (not competing for the
+    same DB connections). Scores and chunk content are included for manual review.
+    Usage:
+        from src.rag.retrievers.schema import benchmark
+        report = await benchmark("what is the primary key of orders?", user_id="xxx")
+    """
+    retriever = SchemaRetriever()
+    targets: list[Strategy] = strategies or ["dense", "dense_no_threshold", "mmr", "hybrid", "hybrid_bm25"]
+    report: dict[str, dict] = {}
+    for name in targets:
+        fn = getattr(retriever, name)
+        t0 = time.perf_counter()
+        chunks = await fn(query, user_id, k)
+        elapsed_ms = round((time.perf_counter() - t0) * 1000)
+        total_chars = sum(len(r.content) for r in chunks)
+        report[name] = {
+            "chunks": len(chunks),
+            "estimated_tokens": total_chars // 4,
+            "elapsed_ms": elapsed_ms,
+            "results": chunks,
+        }
+    return report
 # Module-level instance, built when this module is imported; SchemaRetriever.__init__
 # calls get_vector_store() at that point.
 schema_retriever = SchemaRetriever()