Spaces:

DataEyond
/

Agentic-Service-Data-Eyond

Sleeping

App Files Files Community

Rifqi Hafizuddin committed 17 days ago

Commit

145bca3

1 Parent(s): 83ed744

[KM-507] add different methods, now using dense cosine

Browse files

Files changed (1) hide show

src/rag/retrievers/schema.py +161 -136

src/rag/retrievers/schema.py CHANGED Viewed

@@ -4,6 +4,15 @@ columns stored as source_type="document" with file_type in ("csv","xlsx").
 Multiple retrieval strategies are exposed for benchmarking. The active strategy
 used by the router is `retrieve()`, which dispatches to ACTIVE_STRATEGY.
 Change ACTIVE_STRATEGY at module level to switch without touching the router.
 """
 import asyncio
@@ -19,10 +28,9 @@ from src.rag.base import BaseRetriever, RetrievalResult
 logger = get_logger("schema_retriever")
-_SCORE_THRESHOLD = 0.75  # cosine distance — discard above this value (score < 0.25)
 _TABULAR_FILE_TYPES = ("csv", "xlsx")
-Strategy = Literal["dense", "dense_no_threshold", "mmr", "hybrid", "hybrid_bm25"]
 ACTIVE_STRATEGY: Strategy = "dense_no_threshold"
@@ -31,88 +39,87 @@ class SchemaRetriever(BaseRetriever):
         self.vector_store = get_vector_store()
     # ------------------------------------------------------------------
-    # Internal search helpers
     # ------------------------------------------------------------------
     async def _search_db(
-        self, query: str, user_id: str, k: int, threshold: float | None = _SCORE_THRESHOLD
     ) -> list[RetrievalResult]:
-        docs_with_scores = await self.vector_store.asimilarity_search_with_score(
-            query=query,
-            k=k * 4,  # fetch extra to survive dedup attrition from multiple ingestion runs
-            filter={"user_id": user_id, "source_type": "database"},
-        )
         return [
             RetrievalResult(
-                content=doc.page_content,
-                metadata=doc.metadata,
-                score=1.0 - distance,
                 source_type="database",
             )
-            for doc, distance in docs_with_scores
-            if threshold is None or distance <= threshold
         ]
     async def _search_tabular(
-        self, query: str, user_id: str, k: int, threshold: float | None = _SCORE_THRESHOLD
     ) -> list[RetrievalResult]:
-        # Fetch extra to account for post-filter attrition (non-tabular docs filtered out)
-        docs_with_scores = await self.vector_store.asimilarity_search_with_score(
-            query=query,
-            k=k * 4,
-            filter={"user_id": user_id, "source_type": "document"},
-        )
-        results = []
-        for doc, distance in docs_with_scores:
-            if doc.metadata.get("data", {}).get("file_type") not in _TABULAR_FILE_TYPES:
-                continue
-            if threshold is not None and distance > threshold:
-                continue
-            results.append(
-                RetrievalResult(
-                    content=doc.page_content,
-                    metadata=doc.metadata,
-                    score=1.0 - distance,
-                    source_type="document",
-                )
-            )
-            if len(results) >= k:
-                break
-        return results
-    async def _search_db_mmr(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
-        docs = await self.vector_store.amax_marginal_relevance_search(
-            query=query,
-            k=k * 4,  # fetch extra to survive dedup attrition
-            fetch_k=k * 12,
-            filter={"user_id": user_id, "source_type": "database"},
-        )
-        return [
-            RetrievalResult(
-                content=doc.page_content,
-                metadata=doc.metadata,
-                score=0.0,  # MMR does not return scores
-                source_type="database",
-            )
-            for doc in docs
-        ]
-    async def _search_tabular_mmr(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
-        docs = await self.vector_store.amax_marginal_relevance_search(
-            query=query,
-            k=k * 4,
-            fetch_k=k * 12,
-            filter={"user_id": user_id, "source_type": "document"},
-        )
         results = []
-        for doc in docs:
-            if doc.metadata.get("data", {}).get("file_type") not in _TABULAR_FILE_TYPES:
-                continue
             results.append(
                 RetrievalResult(
-                    content=doc.page_content,
-                    metadata=doc.metadata,
-                    score=0.0,
                     source_type="document",
                 )
             )
@@ -123,9 +130,7 @@ class SchemaRetriever(BaseRetriever):
     async def _search_fts_db(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
         """Full-text search over DB schema chunks using PostgreSQL tsvector.
-        Uses plainto_tsquery (natural language, no operator syntax required).
-        Requires the GIN index created by init_db.py on first startup after table exists.
-        ts_rank score is only used for ordering here; RRF ignores it.
         """
         sql = text("""
             SELECT lpe.document, lpe.cmetadata,
@@ -155,26 +160,53 @@ class SchemaRetriever(BaseRetriever):
             for row in rows
         ]
     def _rrf_merge(
         self,
         *ranked_lists: list[RetrievalResult],
         k_rrf: int = 60,
         top_k: int = 5,
     ) -> list[RetrievalResult]:
-        """Reciprocal Rank Fusion — combines ranked lists using rank positions only.
-        Uses content prefix as dedup key so the same column appearing in multiple
-        lists is counted once with accumulated RRF score.
-        """
-        scores: dict[str, float] = {}
-        index: dict[str, RetrievalResult] = {}
         for ranked in ranked_lists:
             for rank, result in enumerate(ranked):
                 data = result.metadata.get("data", {})
                 key = (data.get("table_name"), data.get("column_name") or data.get("filename"))
                 scores[key] = scores.get(key, 0.0) + 1.0 / (k_rrf + rank + 1)
-                # prefer the result with a real cosine score (dense leg) over ts_rank (FTS leg)
                 if key not in index or result.score > index[key].score:
                     index[key] = result
@@ -186,11 +218,7 @@ class SchemaRetriever(BaseRetriever):
         return merged[:top_k]
     def _dedup(self, results: list[RetrievalResult]) -> list[RetrievalResult]:
-        """Deduplicate by (table_name, column_name), keeping highest score per unique column.
-        Multiple ingestion runs of the same DB produce identical chunks — this collapses
-        them so the LLM context only sees one chunk per column.
-        """
         seen: dict[tuple, RetrievalResult] = {}
         for r in results:
             data = r.metadata.get("data", {})
@@ -200,75 +228,74 @@ class SchemaRetriever(BaseRetriever):
         return sorted(seen.values(), key=lambda r: r.score, reverse=True)
     # ------------------------------------------------------------------
-    # Named strategies — call directly from benchmark / test scripts
     # ------------------------------------------------------------------
-    async def dense(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
-        """Dense similarity with score threshold. Current production default."""
         db_results, tabular_results = await asyncio.gather(
-            self._search_db(query, user_id, k),
-            self._search_tabular(query, user_id, k),
         )
-        combined = self._dedup(db_results + tabular_results)
-        return combined[:k]
-    async def dense_no_threshold(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
-        """Dense similarity without score cutoff.
-        Use to calibrate whether the threshold is too strict/loose —
-        compare returned chunks against `dense` to see what gets filtered out.
         """
         db_results, tabular_results = await asyncio.gather(
-            self._search_db(query, user_id, k, threshold=None),
-            self._search_tabular(query, user_id, k, threshold=None),
         )
-        combined = self._dedup(db_results + tabular_results)
-        return combined[:k]
-    async def mmr(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
-        """MMR (Maximal Marginal Relevance) for diversity.
-        Note: scores are 0.0 — MMR does not expose similarity scores.
-        Dedup still applied since multiple ingestion runs produce duplicate chunks.
         """
         db_results, tabular_results = await asyncio.gather(
-            self._search_db_mmr(query, user_id, k),
-            self._search_tabular_mmr(query, user_id, k),
         )
         return self._dedup(db_results + tabular_results)[:k]
     async def hybrid(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
-        """RRF merge of dense + MMR results.
-        Acts as a proxy for a true dense+FTS hybrid until a PostgreSQL tsvector
-        GIN index is added. Dense covers semantic queries; the second ranking
-        signal from MMR helps surface exact-name matches that dense ranks lower.
-        To upgrade to true FTS hybrid: replace mmr() leg with _search_fts()
-        (raw SQL using to_tsquery) and add the GIN index in init_db.py.
         """
-        dense_results, mmr_results = await asyncio.gather(
-            self.dense(query, user_id, k),
-            self.mmr(query, user_id, k),
         )
-        return self._rrf_merge(dense_results, mmr_results, top_k=k)
     async def hybrid_bm25(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
-        """RRF merge of dense + PostgreSQL FTS (true hybrid).
-        Dense handles semantic queries ("customer information", "revenue columns").
-        FTS handles structural/exact terms that appear literally in chunks:
-        [PRIMARY KEY], [FK ->], column type strings, exact column/table names.
-        FTS results are deduped by (table_name, column_name) before merge to prevent
-        multiple ingestion runs from accumulating RRF score unfairly.
-        Requires GIN index on langchain_pg_embedding.document (created by init_db.py).
         """
-        dense_results, fts_results = await asyncio.gather(
-            self.dense(query, user_id, k),
             self._search_fts_db(query, user_id, k * 4),
         )
-        return self._rrf_merge(dense_results, self._dedup(fts_results), top_k=k)
     # ------------------------------------------------------------------
     # Public interface — called by the router
@@ -291,17 +318,15 @@ async def benchmark(
     k: int = 5,
     strategies: list[Strategy] | None = None,
 ) -> dict[str, dict]:
-    """Run multiple strategies against the same query and return timing + results.
-    Strategies run sequentially so timings are isolated (not competing for the
-    same DB connections). Scores and chunk content are included for manual review.
-    Usage:
-        from src.rag.retrievers.schema import benchmark
-        report = await benchmark("what is the primary key of orders?", user_id="xxx")
-    """
     retriever = SchemaRetriever()
-    targets: list[Strategy] = strategies or ["dense", "dense_no_threshold", "mmr", "hybrid", "hybrid_bm25"]
     report: dict[str, dict] = {}
     for name in targets:

 Multiple retrieval strategies are exposed for benchmarking. The active strategy
 used by the router is `retrieve()`, which dispatches to ACTIVE_STRATEGY.
 Change ACTIVE_STRATEGY at module level to switch without touching the router.
+All strategies embed the query exactly once, then fan out to parallel SQL legs.
+Vector distance strategies:
+  dense_no_threshold  — cosine (<=>), no score floor, returns up to k chunks
+  dense_dot           — inner product (<#>), equivalent to cosine for normalized embeddings
+  dense_l2            — L2/euclidean (<->), monotonic with cosine on unit-sphere vectors
+  hybrid              — RRF merge of dense + FTS, both over database + tabular chunks
+  hybrid_bm25         — RRF merge of dense (database + tabular) + FTS (database only)
 """
 import asyncio
 logger = get_logger("schema_retriever")
 _TABULAR_FILE_TYPES = ("csv", "xlsx")
+Strategy = Literal["dense_no_threshold", "dense_dot", "dense_l2", "hybrid", "hybrid_bm25"]
 ACTIVE_STRATEGY: Strategy = "dense_no_threshold"
         self.vector_store = get_vector_store()
     # ------------------------------------------------------------------
+    # Internal helpers
     # ------------------------------------------------------------------
+    async def _embed_query(self, query: str) -> list[float]:
+        return await asyncio.to_thread(self.vector_store.embeddings.embed_query, query)
     async def _search_db(
+        self, embedding: list[float], user_id: str, k: int, operator: str = "<=>"
     ) -> list[RetrievalResult]:
+        """Vector search over database chunks. Accepts a pre-computed embedding."""
+        emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
+        if operator == "<#>":
+            score_sql = f"(lpe.embedding <#> '{emb_str}'::vector) * -1"
+        elif operator == "<->":
+            score_sql = f"1.0 / (1.0 + (lpe.embedding <-> '{emb_str}'::vector))"
+        else:
+            score_sql = f"1.0 - (lpe.embedding <=> '{emb_str}'::vector)"
+        sql = text(f"""
+            SELECT lpe.document, lpe.cmetadata, {score_sql} AS score
+            FROM langchain_pg_embedding lpe
+            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+            WHERE lpc.name = 'document_embeddings'
+              AND lpe.cmetadata->>'user_id' = :user_id
+              AND lpe.cmetadata->>'source_type' = 'database'
+            ORDER BY lpe.embedding {operator} '{emb_str}'::vector ASC
+            LIMIT :k
+        """)
+        async with _pgvector_engine.connect() as conn:
+            result = await conn.execute(sql, {"user_id": user_id, "k": k * 4})
+            rows = result.fetchall()
         return [
             RetrievalResult(
+                content=row.document,
+                metadata=row.cmetadata,
+                score=float(row.score),
                 source_type="database",
             )
+            for row in rows
         ]
     async def _search_tabular(
+        self, embedding: list[float], user_id: str, k: int, operator: str = "<=>"
     ) -> list[RetrievalResult]:
+        """Vector search over tabular document chunks. Accepts a pre-computed embedding."""
+        emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
+        if operator == "<#>":
+            score_sql = f"(lpe.embedding <#> '{emb_str}'::vector) * -1"
+        elif operator == "<->":
+            score_sql = f"1.0 / (1.0 + (lpe.embedding <-> '{emb_str}'::vector))"
+        else:
+            score_sql = f"1.0 - (lpe.embedding <=> '{emb_str}'::vector)"
+        sql = text(f"""
+            SELECT lpe.document, lpe.cmetadata, {score_sql} AS score
+            FROM langchain_pg_embedding lpe
+            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+            WHERE lpc.name = 'document_embeddings'
+              AND lpe.cmetadata->>'user_id' = :user_id
+              AND lpe.cmetadata->>'source_type' = 'document'
+              AND (lpe.cmetadata->'data'->>'file_type' = 'csv'
+                OR lpe.cmetadata->'data'->>'file_type' = 'xlsx')
+            ORDER BY lpe.embedding {operator} '{emb_str}'::vector ASC
+            LIMIT :k
+        """)
+        async with _pgvector_engine.connect() as conn:
+            result = await conn.execute(sql, {"user_id": user_id, "k": k * 4})
+            rows = result.fetchall()
         results = []
+        for row in rows:
             results.append(
                 RetrievalResult(
+                    content=row.document,
+                    metadata=row.cmetadata,
+                    score=float(row.score),
                     source_type="document",
                 )
             )
     async def _search_fts_db(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
         """Full-text search over DB schema chunks using PostgreSQL tsvector.
+        Requires GIN index on langchain_pg_embedding.document (created by init_db.py).
         """
         sql = text("""
             SELECT lpe.document, lpe.cmetadata,
             for row in rows
         ]
+    async def _search_fts_tabular(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
+        """Full-text search over tabular document chunks using PostgreSQL tsvector."""
+        sql = text("""
+            SELECT lpe.document, lpe.cmetadata,
+                   ts_rank(to_tsvector('english', lpe.document),
+                           plainto_tsquery('english', :query)) AS rank
+            FROM langchain_pg_embedding lpe
+            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+            WHERE lpc.name = 'document_embeddings'
+              AND lpe.cmetadata->>'user_id' = :user_id
+              AND lpe.cmetadata->>'source_type' = 'document'
+              AND (lpe.cmetadata->'data'->>'file_type' = 'csv'
+                OR lpe.cmetadata->'data'->>'file_type' = 'xlsx')
+              AND to_tsvector('english', lpe.document) @@ plainto_tsquery('english', :query)
+            ORDER BY rank DESC
+            LIMIT :k
+        """)
+        async with _pgvector_engine.connect() as conn:
+            result = await conn.execute(sql, {"query": query, "user_id": user_id, "k": k})
+            rows = result.fetchall()
+        return [
+            RetrievalResult(
+                content=row.document,
+                metadata=row.cmetadata,
+                score=float(row.rank),
+                source_type="document",
+            )
+            for row in rows
+        ]
     def _rrf_merge(
         self,
         *ranked_lists: list[RetrievalResult],
         k_rrf: int = 60,
         top_k: int = 5,
     ) -> list[RetrievalResult]:
+        """Reciprocal Rank Fusion — combines ranked lists using rank positions only."""
+        scores: dict[tuple, float] = {}
+        index: dict[tuple, RetrievalResult] = {}
         for ranked in ranked_lists:
             for rank, result in enumerate(ranked):
                 data = result.metadata.get("data", {})
                 key = (data.get("table_name"), data.get("column_name") or data.get("filename"))
                 scores[key] = scores.get(key, 0.0) + 1.0 / (k_rrf + rank + 1)
                 if key not in index or result.score > index[key].score:
                     index[key] = result
         return merged[:top_k]
     def _dedup(self, results: list[RetrievalResult]) -> list[RetrievalResult]:
+        """Deduplicate by (table_name, column_name), keeping highest score per unique column."""
         seen: dict[tuple, RetrievalResult] = {}
         for r in results:
             data = r.metadata.get("data", {})
         return sorted(seen.values(), key=lambda r: r.score, reverse=True)
     # ------------------------------------------------------------------
+    # Named strategies — one embed call each, legs run in parallel
     # ------------------------------------------------------------------
+    async def dense_no_threshold(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
+        """Cosine similarity, no score cutoff — returns up to k chunks after dedup."""
+        embedding = await self._embed_query(query)
         db_results, tabular_results = await asyncio.gather(
+            self._search_db(embedding, user_id, k),
+            self._search_tabular(embedding, user_id, k),
         )
+        return self._dedup(db_results + tabular_results)[:k]
+    async def dense_dot(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
+        """Inner product similarity (<#>).
+        For L2-normalized embeddings (OpenAI), ranking is identical to cosine.
+        Score = raw inner product (not bounded to [0,1]).
         """
+        embedding = await self._embed_query(query)
         db_results, tabular_results = await asyncio.gather(
+            self._search_db(embedding, user_id, k, "<#>"),
+            self._search_tabular(embedding, user_id, k, "<#>"),
         )
+        return self._dedup(db_results + tabular_results)[:k]
+    async def dense_l2(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
+        """L2 (Euclidean) distance similarity (<->).
+        For L2-normalized embeddings (OpenAI), ranking order matches cosine.
+        Score = 1 / (1 + l2_distance), bounded to (0, 1].
         """
+        embedding = await self._embed_query(query)
         db_results, tabular_results = await asyncio.gather(
+            self._search_db(embedding, user_id, k, "<->"),
+            self._search_tabular(embedding, user_id, k, "<->"),
         )
         return self._dedup(db_results + tabular_results)[:k]
     async def hybrid(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
+        """RRF merge of dense + FTS over both database and tabular sources.
+        Embeds once, then runs all four legs (dense db, dense tabular, fts db,
+        fts tabular) in a single asyncio.gather.
         """
+        embedding = await self._embed_query(query)
+        db_results, tabular_results, fts_db, fts_tabular = await asyncio.gather(
+            self._search_db(embedding, user_id, k),
+            self._search_tabular(embedding, user_id, k),
+            self._search_fts_db(query, user_id, k * 4),
+            self._search_fts_tabular(query, user_id, k * 4),
         )
+        dense = self._dedup(db_results + tabular_results)[:k]
+        fts_all = self._dedup(fts_db + fts_tabular)
+        return self._rrf_merge(dense, fts_all, top_k=k)
     async def hybrid_bm25(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
+        """RRF merge of dense + FTS (database chunks only).
+        Embeds once, then runs dense db, dense tabular, and fts db legs in parallel.
         """
+        embedding = await self._embed_query(query)
+        db_results, tabular_results, fts_results = await asyncio.gather(
+            self._search_db(embedding, user_id, k),
+            self._search_tabular(embedding, user_id, k),
             self._search_fts_db(query, user_id, k * 4),
         )
+        dense = self._dedup(db_results + tabular_results)[:k]
+        return self._rrf_merge(dense, self._dedup(fts_results), top_k=k)
     # ------------------------------------------------------------------
     # Public interface — called by the router
     k: int = 5,
     strategies: list[Strategy] | None = None,
 ) -> dict[str, dict]:
+    """Run multiple strategies against the same query and return timing + results."""
     retriever = SchemaRetriever()
+    targets: list[Strategy] = strategies or [
+        "dense_no_threshold",
+        "dense_dot",
+        "dense_l2",
+        "hybrid",
+        "hybrid_bm25",
+    ]
     report: dict[str, dict] = {}
     for name in targets: