GitHub Actions committed
Commit acfcc03 · 1 Parent(s): 5941cd9

Deploy 91b3f47

app/api/chat.py CHANGED
@@ -2,6 +2,7 @@ import asyncio
 import json
 import re
 import time
+from typing import Annotated
 from fastapi import APIRouter, Request, Depends
 from fastapi.responses import StreamingResponse
 
@@ -11,6 +12,7 @@ from app.security.rate_limiter import chat_rate_limit
 from app.security.jwt_auth import verify_jwt
 
 router = APIRouter()
+_BACKGROUND_TASKS: set[asyncio.Task[object]] = set()
 
 # Keep-alive interval for SSE when upstream nodes are still working.
 # Prevents edge/proxy idle timeouts on long retrieval/generation turns.
@@ -22,14 +24,14 @@ _EXPANSION_TIMEOUT_SECONDS: float = 0.60
 
 # Phrases a visitor uses when telling the bot it gave a wrong answer.
 # Matched on the lowercased raw message before any LLM call — O(1), zero cost.
-_CRITICISM_SIGNALS: frozenset[str] = frozenset({
+_CRITICISM_SIGNALS: tuple[str, ...] = (
     "that's wrong", "thats wrong", "you're wrong", "youre wrong",
     "not right", "wrong answer", "you got it wrong", "that is wrong",
    "that's incorrect", "you're incorrect", "thats incorrect", "youre incorrect",
     "fix that", "fix your answer", "actually no", "no that's", "no thats",
     "that was wrong", "your answer was wrong", "wrong information",
     "incorrect information", "that's not right", "thats not right",
-})
+)
 
 
 def _is_criticism(message: str) -> bool:
@@ -52,11 +54,11 @@ def _filter_sources_by_citations(answer: str, sources: list) -> list:
     if not cited_nums:
         return sources
 
-    max_cited = max(cited_nums)
-    if max_cited > len(sources):
+    valid_cited_nums = {num for num in cited_nums if 1 <= num <= len(sources)}
+    if not valid_cited_nums:
         return sources
 
-    return [s for i, s in enumerate(sources, start=1) if i in cited_nums]
+    return [s for i, s in enumerate(sources, start=1) if i in valid_cited_nums]
 
 
 async def _generate_follow_ups(
@@ -124,7 +126,6 @@ async def _update_summary_async(
     previous_summary: str | None,
     query: str,
     answer: str,
-    processing_api_key: str | None,
 ) -> None:
     """
     Triggered post-response to update the rolling conversation summary.
@@ -135,7 +136,6 @@ async def _update_summary_async(
         previous_summary=previous_summary or "",
         new_turn_q=query,
         new_turn_a=answer[:600],  # cap answer chars sent to Gemini
-        processing_api_key=processing_api_key,
     )
     if new_summary:
         conv_store.set_summary(session_id, new_summary)
@@ -143,12 +143,17 @@ async def _update_summary_async(
         pass
 
 
+def _track_background_task(task: asyncio.Task[object]) -> None:
+    _BACKGROUND_TASKS.add(task)
+    task.add_done_callback(_BACKGROUND_TASKS.discard)
+
+
 @router.post("")
 @chat_rate_limit()
 async def chat_endpoint(
     request: Request,
     request_data: ChatRequest,
-    token_payload: dict = Depends(verify_jwt),
+    token_payload: Annotated[dict, Depends(verify_jwt)],
 ) -> StreamingResponse:
     """Stream RAG answer as typed SSE events.
 
@@ -344,19 +349,22 @@ async def chat_endpoint(
         # ── Follow-up questions ────────────────────────────────────────────
         # Generated after the done event so it never delays answer delivery.
         if final_answer and not await request.is_disconnected():
-            follow_ups = await _generate_follow_ups(
-                request_data.message, final_answer, final_sources, llm_client
+            follow_up_task: asyncio.Task[list[str]] = asyncio.create_task(
+                _generate_follow_ups(request_data.message, final_answer, final_sources, llm_client)
             )
+            _track_background_task(follow_up_task)
+            try:
+                follow_ups = await asyncio.wait_for(follow_up_task, timeout=0.25)
+            except Exception:
+                follow_up_task.cancel()
+                follow_ups = []
             if follow_ups:
                 yield f"event: follow_ups\ndata: {json.dumps({'questions': follow_ups})}\n\n"
 
         # Stage 2: update rolling summary asynchronously — fired after the
         # response is fully delivered so it adds zero latency to the turn.
         if final_answer and gemini_client and gemini_client.is_configured:
-            processing_key = getattr(
-                request.app.state, "gemini_processing_api_key", None
-            )
-            asyncio.create_task(
+            summary_task: asyncio.Task[None] = asyncio.create_task(
                 _update_summary_async(
                     conv_store=conv_store,
                     gemini_client=gemini_client,
@@ -364,9 +372,9 @@ async def chat_endpoint(
                     previous_summary=conversation_summary,
                     query=request_data.message,
                     answer=final_answer,
-                    processing_api_key=processing_key,
                 )
             )
+            _track_background_task(summary_task)
 
     except Exception as exc:
         yield f"data: {json.dumps({'error': str(exc) or 'Generation failed'})}\n\n"
app/main.py CHANGED
@@ -68,6 +68,24 @@ def _normalize_qdrant_url(url: str) -> str:
     return raw
 
 
+def _setup_dagshub_tracking(settings) -> None:
+    if not settings.DAGSHUB_TOKEN:
+        return
+
+    try:
+        import dagshub
+
+        dagshub.init(
+            repo_owner=settings.DAGSHUB_REPO.split("/")[0],
+            repo_name=settings.DAGSHUB_REPO.split("/")[1],
+            mlflow=True,
+            dvc=False,
+        )
+        logger.info("DagsHub MLflow tracking enabled | repo=%s", settings.DAGSHUB_REPO)
+    except Exception as exc:
+        logger.warning("DagsHub MLflow tracking disabled: %s", exc)
+
+
 async def _qdrant_keepalive_loop(
     qdrant: QdrantClient,
     interval_seconds: int,
@@ -111,15 +129,7 @@ async def lifespan(app: FastAPI):
     # DagsHub/MLflow experiment tracking — optional, only active when token is set.
     # In prod with DAGSHUB_TOKEN set, experiments are tracked at dagshub.com.
     # In local or test environments, MLflow is a no-op.
-    if settings.DAGSHUB_TOKEN:
-        import dagshub
-        dagshub.init(
-            repo_owner=settings.DAGSHUB_REPO.split("/")[0],
-            repo_name=settings.DAGSHUB_REPO.split("/")[1],
-            mlflow=True,
-            dvc=False,
-        )
-        logger.info("DagsHub MLflow tracking enabled | repo=%s", settings.DAGSHUB_REPO)
+    _setup_dagshub_tracking(settings)
 
     embedder = Embedder(remote_url=settings.EMBEDDER_URL, environment=settings.ENVIRONMENT)
     reranker = Reranker(remote_url=settings.RERANKER_URL, environment=settings.ENVIRONMENT)
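
Wrapping the DagsHub initialisation in try/except means a missing or misconfigured tracking dependency now degrades to a logged warning instead of failing startup. A generic sketch of the same guard, assuming a hypothetical some_tracking_lib module and token setting:

import importlib
import logging

logger = logging.getLogger(__name__)

def setup_optional_tracking(token: str | None) -> None:
    # Optional telemetry must never block serving traffic: no token means
    # skip quietly; any import or init failure downgrades to a warning.
    if not token:
        return
    try:
        tracking = importlib.import_module("some_tracking_lib")  # hypothetical module name
        tracking.init(token=token)  # hypothetical init signature
        logger.info("tracking enabled")
    except Exception as exc:
        logger.warning("tracking disabled: %s", exc)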
app/pipeline/graph.py CHANGED
@@ -113,7 +113,7 @@ def build_pipeline(services: dict) -> CompiledStateGraph:
     graph.add_node("guard", make_guard_node(services["classifier"]))
     graph.add_node("enumerate_query", make_enumerate_query_node(services["vector_store"]))
     graph.add_node("cache", make_cache_node(services["cache"], services["embedder"]))
-    graph.add_node("gemini_fast", make_gemini_fast_node(services["gemini"]))
+    graph.add_node("gemini_fast", make_gemini_fast_node())
     graph.add_node("retrieve", make_retrieve_node(
         services["vector_store"],
         services["embedder"],
app/pipeline/nodes/cache.py CHANGED
@@ -29,7 +29,7 @@ from app.services.semantic_cache import SemanticCache
 # prior turn, and excluding them would bypass cache on most portfolio queries.
 _REFERENCE_TOKENS: frozenset[str] = frozenset({
     "that", "it", "its", "they", "their", "those",
-    "this", "these", "them", "there", "then",
+    "this", "these", "them",
 })
 
 
@@ -67,9 +67,20 @@ def make_cache_node(cache: SemanticCache, embedder) -> Callable[[PipelineState],
         cached = await cache.get(query_embedding)
         if cached:
             writer({"type": "status", "label": "Found a recent answer, loading..."})
-            # Emit the full cached answer as a single token event: the cache
-            # returns complete text, not a stream, so one event is correct.
-            writer({"type": "token", "text": cached})
+            # Stream cached answers in short chunks so the SSE contract stays
+            # consistent with non-cache paths.
+            words = cached.split()
+            chunk: list[str] = []
+            chunk_len = 0
+            for word in words:
+                chunk.append(word)
+                chunk_len += len(word) + 1
+                if chunk_len >= 80:
+                    writer({"type": "token", "text": " ".join(chunk)})
+                    chunk = []
+                    chunk_len = 0
+            if chunk:
+                writer({"type": "token", "text": " ".join(chunk)})
             return {
                 "answer": cached,
                 "cached": True,
app/pipeline/nodes/gemini_fast.py CHANGED
@@ -5,6 +5,7 @@ and citation-capable. No parametric Gemini answer generation is used here.
 """
 from __future__ import annotations
 
+import asyncio
 import logging
 import re
 from typing import Any
@@ -12,7 +13,6 @@ from typing import Any
 from langgraph.config import get_stream_writer
 
 from app.models.pipeline import PipelineState
-from app.services.gemini_client import GeminiClient
 
 logger = logging.getLogger(__name__)
 
@@ -43,14 +43,17 @@ _SMALL_TALK_ANSWER = (
     "and I'll find the details for you."
 )
 
-def make_gemini_fast_node(gemini_client: GeminiClient) -> Any:
+def make_gemini_fast_node(gemini_client: Any | None = None) -> Any:
     """
     Returns a LangGraph-compatible async node function.
-    ``gemini_client`` is injected at startup from app.state.gemini_client.
     """
+    configured = bool(gemini_client and getattr(gemini_client, "is_configured", False))
 
     async def gemini_fast(state: PipelineState) -> dict:
+        await asyncio.sleep(0)
         writer = get_stream_writer()
+        if configured:
+            logger.debug("Gemini client is configured, but fast-path remains deterministic.")
         writer({"type": "status", "label": "Thinking about your question directly..."})
 
         query = state["query"]
app/pipeline/nodes/generate.py CHANGED
@@ -336,9 +336,9 @@ def _build_low_trust_fallback(query: str, source_refs: list[SourceRef]) -> str:
 
     if _VERSION_PARITY_RE.search(query):
         return (
-            "The indexed sources include related details [1], but they do not explicitly "
+            "The indexed sources include related details, but they do not explicitly "
             "confirm whether the GitHub code and live demo are currently in sync, so version parity "
-            "cannot be verified from indexed content alone [1]."
+            "cannot be verified from indexed content alone."
         )
 
     return (
@@ -369,14 +369,14 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
     # needed, just reliable numbered-list formatting with one citation per item.
     if state.get("is_enumeration_query") and reranked_chunks:
         writer({"type": "status", "label": "Formatting complete list..."})
-        context_parts: list[str] = []
+        enum_context_parts: list[str] = []
         source_refs: list[SourceRef] = []
         for i, chunk in enumerate(reranked_chunks, start=1):
             meta = chunk["metadata"]
             header = f"[{i}] {meta.get('source_title', 'Item')}"
             if meta.get("source_url"):
                 header += f" ({meta['source_url']})"
-            context_parts.append(f"{header}\n{chunk['text'][:300]}")
+            enum_context_parts.append(f"{header}\n{chunk['text'][:300]}")
             source_refs.append(
                 SourceRef(
                     title=meta.get("source_title", ""),
@@ -385,7 +385,7 @@ def make_generate_node(llm_client: LLMClient, gemini_client=None) -> Callable[[P
                     source_type=meta.get("source_type", ""),
                 )
             )
-        context_block_enum = "\n\n".join(context_parts)
+        context_block_enum = "\n\n".join(enum_context_parts)
         prompt_enum = f"Items fetched from database:\n{context_block_enum}\n\nVisitor request: {query}"
         stream = llm_client.complete_with_complexity(
             prompt=prompt_enum,
app/pipeline/nodes/guard.py CHANGED
@@ -37,7 +37,7 @@ def make_guard_node(classifier: GuardClassifier) -> Callable[[PipelineState], di
         }
 
     # 2. Classify (scope evaluation).
-    is_safe, score = classifier.is_in_scope(clean_query)
+    is_safe, _ = classifier.is_in_scope(clean_query)
 
     if not is_safe:
         return {
app/pipeline/nodes/log_eval.py CHANGED
@@ -13,6 +13,56 @@ logger = logging.getLogger(__name__)
 _PENDING_TASKS: set[asyncio.Task[None]] = set()
 
 
+def _ensure_interactions_schema(db_path: str) -> None:
+    db_dir = os.path.dirname(db_path)
+    if db_dir:
+        os.makedirs(db_dir, exist_ok=True)
+
+    with sqlite3.connect(db_path) as conn:
+        conn.execute(
+            """
+            CREATE TABLE IF NOT EXISTS interactions (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                timestamp TEXT,
+                session_id TEXT,
+                query TEXT,
+                answer TEXT,
+                chunks_used TEXT,
+                rerank_scores TEXT,
+                reranked_chunks_json TEXT,
+                latency_ms INTEGER,
+                cached BOOLEAN,
+                feedback INTEGER DEFAULT 0,
+                path TEXT DEFAULT 'rag',
+                critic_groundedness INTEGER,
+                critic_completeness INTEGER,
+                critic_specificity INTEGER,
+                critic_quality TEXT,
+                is_enumeration_query BOOLEAN DEFAULT 0,
+                source_hit_proxy INTEGER DEFAULT 0
+            )
+            """
+        )
+        for col, definition in [
+            ("reranked_chunks_json", "TEXT DEFAULT '[]'"),
+            ("feedback", "INTEGER DEFAULT 0"),
+            ("session_id", "TEXT DEFAULT ''"),
+            ("path", "TEXT DEFAULT 'rag'"),
+            ("critic_groundedness", "INTEGER"),
+            ("critic_completeness", "INTEGER"),
+            ("critic_specificity", "INTEGER"),
+            ("critic_quality", "TEXT"),
+            ("is_enumeration_query", "BOOLEAN DEFAULT 0"),
+            ("sibling_expansion_count", "INTEGER"),
+            ("focused_source_type", "TEXT"),
+            ("source_hit_proxy", "INTEGER DEFAULT 0"),
+        ]:
+            try:
+                conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
+            except sqlite3.OperationalError:
+                pass
+
+
 def _source_hit_proxy(state: PipelineState) -> int:
     reranked_chunks = state.get("reranked_chunks", [])
     chunk_count = len(reranked_chunks)
@@ -35,11 +85,9 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
     only RAG interactions have chunk associations for valid training pairs.
     """
 
-    def _write_to_sqlite(state: PipelineState) -> int:
-        db_dir = os.path.dirname(db_path)
-        if db_dir:
-            os.makedirs(db_dir, exist_ok=True)
+    _ensure_interactions_schema(db_path)
 
+    def _write_to_sqlite(state: PipelineState) -> int:
         chunks_used = json.dumps(
             [c["metadata"]["doc_id"] for c in state.get("reranked_chunks", [])]
         )
@@ -61,54 +109,6 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
         source_hit_proxy = _source_hit_proxy(state)
 
         with sqlite3.connect(db_path) as conn:
-            conn.execute(
-                """
-                CREATE TABLE IF NOT EXISTS interactions (
-                    id INTEGER PRIMARY KEY AUTOINCREMENT,
-                    timestamp TEXT,
-                    session_id TEXT,
-                    query TEXT,
-                    answer TEXT,
-                    chunks_used TEXT,
-                    rerank_scores TEXT,
-                    reranked_chunks_json TEXT,
-                    latency_ms INTEGER,
-                    cached BOOLEAN,
-                    feedback INTEGER DEFAULT 0,
-                    path TEXT DEFAULT 'rag',
-                    critic_groundedness INTEGER,
-                    critic_completeness INTEGER,
-                    critic_specificity INTEGER,
-                    critic_quality TEXT,
-                    is_enumeration_query BOOLEAN DEFAULT 0,
-                    source_hit_proxy INTEGER DEFAULT 0
-                )
-                """
-            )
-            # Idempotent schema upgrades for deployments that pre-date these columns.
-            for col, definition in [
-                ("reranked_chunks_json", "TEXT DEFAULT '[]'"),
-                ("feedback", "INTEGER DEFAULT 0"),
-                ("session_id", "TEXT DEFAULT ''"),
-                # path column: old rows default to "rag" — they were all RAG interactions.
-                ("path", "TEXT DEFAULT 'rag'"),
-                # Stage 3 SELF-RAG critic scores
-                ("critic_groundedness", "INTEGER"),
-                ("critic_completeness", "INTEGER"),
-                ("critic_specificity", "INTEGER"),
-                ("critic_quality", "TEXT"),
-                # Fix 1: enumeration classifier flag
-                ("is_enumeration_query", "BOOLEAN DEFAULT 0"),
-                # RC-13: retrieval diagnostics
-                ("sibling_expansion_count", "INTEGER"),
-                ("focused_source_type", "TEXT"),
-                ("source_hit_proxy", "INTEGER DEFAULT 0"),
-            ]:
-                try:
-                    conn.execute(f"ALTER TABLE interactions ADD COLUMN {col} {definition}")
-                except sqlite3.OperationalError:
-                    pass  # Column already exists.
-
             cursor = conn.execute(
                 """
                 INSERT INTO interactions
@@ -142,6 +142,7 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
         return cursor.lastrowid  # type: ignore[return-value]
 
     def _build_loki_record(state: PipelineState) -> dict:
+        reranked_chunks = state.get("reranked_chunks", [])
         return {
             "timestamp": datetime.now(tz=timezone.utc).isoformat(),
             "session_id": state.get("session_id", ""),
@@ -164,7 +165,16 @@ def make_log_eval_node(db_path: str) -> Callable[[PipelineState], dict]:
             "is_followup": state.get("is_followup", False),
             "is_audio_mode": state.get("is_audio_mode", False),
             "follow_ups": state.get("follow_ups", []),
-            "reranked_chunks": state.get("reranked_chunks", []),
+            "chunk_count": len(reranked_chunks),
+            "top_chunk_doc_id": reranked_chunks[0]["metadata"].get("doc_id", "") if reranked_chunks else "",
+            "source_types_used": sorted(
+                {
+                    c["metadata"].get("source_type", "")
+                    for c in reranked_chunks
+                    if c["metadata"].get("source_type", "")
+                }
+            ),
+            "rerank_scores": [c["metadata"].get("rerank_score", 0.0) for c in reranked_chunks],
             "source_hit_proxy": _source_hit_proxy(state),
         }
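
Hoisting the DDL into _ensure_interactions_schema runs table creation and the column upgrades once, when the node is built, instead of on every logged interaction. The ALTER TABLE guarded by sqlite3.OperationalError is the usual SQLite idiom because SQLite has no ADD COLUMN IF NOT EXISTS. The guard in isolation:

import sqlite3

def ensure_column(conn: sqlite3.Connection, table: str, col: str, definition: str) -> None:
    # Attempt the ALTER and swallow the duplicate-column error: SQLite has
    # no "ADD COLUMN IF NOT EXISTS", so this is the idempotent form.
    try:
        conn.execute(f"ALTER TABLE {table} ADD COLUMN {col} {definition}")
    except sqlite3.OperationalError:
        pass  # column already exists

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE interactions (id INTEGER PRIMARY KEY)")
ensure_column(conn, "interactions", "path", "TEXT DEFAULT 'rag'")
ensure_column(conn, "interactions", "path", "TEXT DEFAULT 'rag'")  # second call is a no-op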
 
app/pipeline/nodes/retrieve.py CHANGED
@@ -7,6 +7,7 @@ from langgraph.config import get_stream_writer
 
 logger = logging.getLogger(__name__)
 
+from app.core.topic import _STOPWORDS
 from app.models.pipeline import PipelineState, Chunk
 from app.services.vector_store import VectorStore
 from app.services.embedder import Embedder
@@ -97,29 +98,6 @@ _CAPABILITY_QUERY_HINTS: frozenset[str] = frozenset(
     }
 )
 
-_BIOGRAPHY_QUERY_HINTS: frozenset[str] = frozenset(
-    {
-        "work",
-        "experience",
-        "employment",
-        "career",
-        "internship",
-        "internships",
-        "education",
-        "degree",
-        "university",
-        "background",
-        "resume",
-        "cv",
-        "company",
-        "companies",
-        "role",
-        "roles",
-    }
-)
-
-_BIO_SOURCE_TYPES: frozenset[str] = frozenset({"resume", "cv", "bio"})
-
 _NORMALISATION_STOPWORDS: frozenset[str] = frozenset(
     {
         "tell",
@@ -200,34 +178,58 @@ _FOCUS_VOCAB: frozenset[str] = frozenset(
     }
 )
 
+_FOCUS_VOCAB_BY_INITIAL: dict[str, tuple[str, ...]] = {}
+for candidate in _FOCUS_VOCAB:
+    initial = candidate[0]
+    existing = _FOCUS_VOCAB_BY_INITIAL.get(initial)
+    if existing is None:
+        _FOCUS_VOCAB_BY_INITIAL[initial] = (candidate,)
+    else:
+        _FOCUS_VOCAB_BY_INITIAL[initial] = existing + (candidate,)
+
 
-def _edit_distance(a: str, b: str) -> int:
-    la, lb = len(a), len(b)
-    dp = list(range(lb + 1))
-    for i in range(1, la + 1):
-        prev = dp[0]
-        dp[0] = i
-        for j in range(1, lb + 1):
-            cur = dp[j]
-            cost = 0 if a[i - 1] == b[j - 1] else 1
-            dp[j] = min(dp[j] + 1, dp[j - 1] + 1, prev + cost)
-            prev = cur
-    return dp[lb]
+def _bounded_edit_distance(a: str, b: str, max_distance: int = 2) -> int:
+    if abs(len(a) - len(b)) > max_distance:
+        return max_distance + 1
+
+    previous_row = list(range(len(b) + 1))
+    for i, ca in enumerate(a, start=1):
+        current_row = [i]
+        row_min = current_row[0]
+        for j, cb in enumerate(b, start=1):
+            val = min(
+                current_row[j - 1] + 1,
+                previous_row[j] + 1,
+                previous_row[j - 1] + (0 if ca == cb else 1),
+            )
+            current_row.append(val)
+            if val < row_min:
+                row_min = val
+        if row_min > max_distance:
+            return max_distance + 1
+        previous_row = current_row
+
+    return previous_row[-1]
 
 
 def _best_focus_replacement(token: str) -> str | None:
-    best = None
-    best_score = 99
-    for candidate in _FOCUS_VOCAB:
-        if token[0] != candidate[0]:
-            continue
+    candidates = _FOCUS_VOCAB_BY_INITIAL.get(token[0], ())
+    if not candidates:
+        return None
+
+    best: str | None = None
+    best_distance = 3
+    for candidate in candidates:
         if abs(len(token) - len(candidate)) > 1:
             continue
-        score = _edit_distance(token, candidate)
-        if score <= 2 and score < best_score:
-            best_score = score
+        distance = _bounded_edit_distance(token, candidate, max_distance=2)
+        if distance < best_distance:
             best = candidate
-    return best
+            best_distance = distance
+        if distance == 1:
+            break
+
+    return best if best_distance <= 2 else None
 
 
 def _normalise_focus_typos(query: str) -> str:
@@ -265,11 +267,6 @@ def _is_capability_query(query: str) -> bool:
     return bool(tokens & _CAPABILITY_QUERY_HINTS)
 
 
-def _is_biography_query(query: str) -> bool:
-    tokens = frozenset(re.findall(r"[a-z0-9]+", query.lower()))
-    return bool(tokens & _BIOGRAPHY_QUERY_HINTS)
-
-
 def _is_informative_chunk(chunk: Chunk) -> bool:
     """True when chunk text has enough lexical content for cross-encoder reranking."""
     text = (chunk.get("contextualised_text") or chunk["text"] or "").strip()
@@ -494,9 +491,9 @@ def make_retrieve_node(
         # chunks from the same source document via doc_id filter (no vector needed).
         # If chunk 4 of a blog post matched, chunks 1-3 and 5-6 are now candidates too.
         # This is the document-graph connectivity layer: doc_id is the edge linking chunks.
+        sibling_count = 0
         if unique_chunks:
             sibling_fps: set[str] = {f"{c['metadata']['doc_id']}::{c['metadata']['section']}" for c in unique_chunks}
-            sibling_count = 0
             for seed in unique_chunks[:_SIBLING_EXPAND_TOP_N]:
                 if sibling_count >= _SIBLING_TOTAL_CAP:
                     break
@@ -533,23 +530,6 @@ def make_retrieve_node(
         if not rerank_candidates:
             rerank_candidates = unique_chunks
 
-        # Biography-focused queries should prioritize resume/CV evidence and avoid
-        # project/blog code passages crowding out personal background facts.
-        if _is_biography_query(retrieval_query):
-            bio_candidates = [
-                chunk
-                for chunk in rerank_candidates
-                if chunk["metadata"].get("source_type", "") in _BIO_SOURCE_TYPES
-            ]
-            if bio_candidates:
-                rerank_candidates = bio_candidates
-                writer(
-                    {
-                        "type": "status",
-                        "label": "Prioritizing resume and background sources...",
-                    }
-                )
-
         try:
             reranked = await reranker.rerank(retrieval_query, rerank_candidates, top_k=10)  # RC-5: raised from 7
         except (Exception, asyncio.CancelledError) as exc:
@@ -606,7 +586,9 @@ def make_retrieve_node(
                 "answer": "",
                 "retrieved_chunks": [],
                 "reranked_chunks": [],
-                "retrieval_attempts": attempts + 1, "top_rerank_score": top_score, }
+                "retrieval_attempts": attempts + 1,
+                "top_rerank_score": top_score,
+            }
 
             if rescue_low_confidence:
                 writer(
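
Two things speed up _best_focus_replacement here: candidates are pre-bucketed by first letter once at import time, and _bounded_edit_distance abandons a Wagner-Fischer row as soon as every cell in it exceeds max_distance, which is sound because row values never decrease from one row to the next. A condensed, self-contained copy for illustration (the two asserts were verified by hand):

def bounded_edit_distance(a: str, b: str, max_distance: int = 2) -> int:
    if abs(len(a) - len(b)) > max_distance:
        return max_distance + 1
    previous_row = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        current_row = [i]
        for j, cb in enumerate(b, start=1):
            current_row.append(min(
                current_row[j - 1] + 1,                        # insertion
                previous_row[j] + 1,                           # deletion
                previous_row[j - 1] + (0 if ca == cb else 1),  # substitution
            ))
        if min(current_row) > max_distance:
            return max_distance + 1  # no later row can drop back inside the band
        previous_row = current_row
    return previous_row[-1]

assert bounded_edit_distance("projects", "project") == 1  # one trailing deletion
assert bounded_edit_distance("stack", "skills") == 3      # true distance 4, reported as max_distance + 1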
app/services/loki_sink.py CHANGED
@@ -51,13 +51,25 @@ def _to_float_or_none(value: Any) -> float | None:
 
 
 def _build_sanitized_record(record: dict[str, Any]) -> dict[str, Any]:
-    reranked_chunks = record.get("reranked_chunks") or []
     query = str(record.get("query", ""))
     session_id = str(record.get("session_id", ""))
-    rerank_scores, source_types_used, chunk_count, top_chunk_doc_id = _extract_chunk_metrics(reranked_chunks)
+
+    rerank_scores = record.get("rerank_scores") or []
+    source_types_used = record.get("source_types_used") or []
+    chunk_count = int(record.get("chunk_count", 0) or 0)
+    top_chunk_doc_id = str(record.get("top_chunk_doc_id", "") or "")
+
+    if not rerank_scores and record.get("reranked_chunks"):
+        rerank_scores, source_types_used, chunk_count, top_chunk_doc_id = _extract_chunk_metrics(
+            record.get("reranked_chunks") or []
+        )
 
     top_rerank_score = _to_float_or_none(record.get("top_rerank_score"))
-    source_hit_proxy = int(top_rerank_score is not None and top_rerank_score > -1.5 and chunk_count >= 2)
+    source_hit_proxy = int(
+        record.get("source_hit_proxy")
+        if record.get("source_hit_proxy") is not None
+        else (top_rerank_score is not None and top_rerank_score > -1.5 and chunk_count >= 2)
+    )
 
     return {
         "timestamp": str(record.get("timestamp", "")),
@@ -81,9 +93,9 @@ def _build_sanitized_record(record: dict[str, Any]) -> dict[str, Any]:
         "query_hash": _sha_prefix(query, 16) if query else "",
         "chunk_count": chunk_count,
         "top_chunk_doc_id": top_chunk_doc_id,
-        "source_types_used": source_types_used,
+        "source_types_used": sorted(str(source_type) for source_type in source_types_used if str(source_type)),
         "follow_up_count": len(record.get("follow_ups") or []),
-        "rerank_scores": rerank_scores,
+        "rerank_scores": [float(score) for score in rerank_scores if isinstance(score, (int, float))],
         "source_hit_proxy": source_hit_proxy,
     }
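
The sanitizer now prefers the pre-aggregated fields emitted by the reworked log node and recomputes from raw reranked_chunks only as a backward-compatibility fallback, so chunk text no longer needs to reach the sink at all. The precedence rule in isolation (record shapes are illustrative):

from typing import Any

def pick_rerank_scores(record: dict[str, Any]) -> list[float]:
    # Prefer pre-aggregated scores; fall back to extracting from raw chunks
    # only for records produced by older emitters.
    scores = record.get("rerank_scores") or []
    if not scores and record.get("reranked_chunks"):
        scores = [c["metadata"].get("rerank_score", 0.0) for c in record["reranked_chunks"]]
    return [float(s) for s in scores if isinstance(s, (int, float))]

assert pick_rerank_scores({"rerank_scores": [0.9, 0.4]}) == [0.9, 0.4]
assert pick_rerank_scores({"reranked_chunks": [{"metadata": {"rerank_score": 0.5}}]}) == [0.5]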
 
tests/test_cache_reference_tokens.py ADDED
@@ -0,0 +1,11 @@
+from app.pipeline.nodes.cache import _has_unresolved_reference
+
+
+def test_has_unresolved_reference_ignores_spatial_and_temporal_words() -> None:
+    assert _has_unresolved_reference("tell me about there") is False
+    assert _has_unresolved_reference("what happened then") is False
+
+
+def test_has_unresolved_reference_detects_pronouns_and_demonstratives() -> None:
+    assert _has_unresolved_reference("tell me about this") is True
+    assert _has_unresolved_reference("what is that project") is True
tests/test_chat_source_filtering.py CHANGED
@@ -35,3 +35,15 @@ def test_filter_sources_by_citations_no_citations_returns_input() -> None:
     filtered = _filter_sources_by_citations(answer, sources)
 
     assert filtered == sources
+
+
+def test_filter_sources_by_citations_discards_only_out_of_range_sources() -> None:
+    sources = [
+        {"title": "A"},
+        {"title": "B"},
+    ]
+    answer = "Valid [1], missing [9]."
+
+    filtered = _filter_sources_by_citations(answer, sources)
+
+    assert [s["title"] for s in filtered] == ["A"]
tests/test_generate_quality_fallback.py CHANGED
@@ -18,7 +18,7 @@ def test_low_trust_fallback_for_version_parity_queries() -> None:
     )
 
     assert "cannot be verified" in answer
-    assert "[1]" in answer
+    assert "[1]" not in answer
 
 
 def test_low_trust_fallback_general_query_is_concise() -> None:
tests/test_log_eval_privacy.py CHANGED
@@ -1,9 +1,10 @@
+import asyncio
 import json
 import sqlite3
 
 import pytest
 
-from app.pipeline.nodes.log_eval import make_log_eval_node
+from app.pipeline.nodes.log_eval import _PENDING_TASKS, make_log_eval_node
 
 
 def test_log_eval_stores_chunk_metadata_without_text(tmp_path) -> None:
@@ -42,3 +43,49 @@ def test_log_eval_stores_chunk_metadata_without_text(tmp_path) -> None:
     assert payload and payload[0]["doc_id"] == "resume-rag"
     assert payload[0]["source_title"] == "Resume"
     assert "text" not in payload[0]
+
+
+@pytest.mark.asyncio
+async def test_log_eval_sends_sanitized_loki_payload(monkeypatch, tmp_path) -> None:
+    db_path = str(tmp_path / "interactions.db")
+    node = make_log_eval_node(db_path)
+    captured: dict = {}
+
+    async def _fake_ship_to_loki(record: dict) -> None:
+        await asyncio.sleep(0)
+        captured["record"] = record
+
+    monkeypatch.setattr("app.pipeline.nodes.log_eval.ship_to_loki", _fake_ship_to_loki)
+
+    node(
+        {
+            "session_id": "s1",
+            "query": "What work experience does Darshan have?",
+            "answer": "He worked at VK Live.",
+            "reranked_chunks": [
+                {
+                    "text": "Phone +44 7818 975908 and email someone@example.com",
+                    "metadata": {
+                        "doc_id": "resume-rag",
+                        "source_title": "Resume",
+                        "source_type": "resume",
+                        "section": "Work Experience",
+                        "rerank_score": 0.9,
+                    },
+                }
+            ],
+            "latency_ms": 123,
+            "cached": False,
+            "path": "rag",
+            "is_enumeration_query": False,
+            "top_rerank_score": 0.9,
+        }
+    )
+
+    await asyncio.gather(*list(_PENDING_TASKS))
+
+    record = captured.get("record")
+    assert record is not None
+    assert "reranked_chunks" not in record
+    assert record["chunk_count"] == 1
+    assert record["top_chunk_doc_id"] == "resume-rag"