07Codex07 committed on
Commit
41d23d8
·
1 Parent(s): 7fdfc47

Changed the context handling and retrieval logic

Browse files
Files changed (2) hide show
  1. chatbot_retriever.py +63 -9
  2. main_api.py +39 -5
chatbot_retriever.py CHANGED
@@ -100,12 +100,13 @@ def ensure_data_dir():
100
  ]
101
 
102
  local_paths = []
 
103
  for f in files:
104
  dest_path = os.path.join(data_dir, f) # ✅ keep real folder structure
105
  os.makedirs(os.path.dirname(dest_path), exist_ok=True)
106
 
107
  if not os.path.exists(dest_path):
108
- print(f"📥 Downloading {f} from Hugging Face (public dataset)...")
109
  downloaded = hf_hub_download(
110
  repo_id=DATASET_REPO,
111
  filename=f,
@@ -113,13 +114,17 @@ def ensure_data_dir():
113
  force_download=True,
114
  )
115
  shutil.copy(downloaded, dest_path) # ✅ copy instead of rename (works inside HF Spaces)
 
116
 
117
  local_paths.append(dest_path)
118
 
119
- # Debug info for verification
120
- print(f"✅ Total files ensured: {len(local_paths)}")
121
- for p in local_paths[:3]:
122
- print(f" {p}")
 
 
 
123
 
124
  return local_paths
125
 
@@ -206,14 +211,24 @@ def load_all_docs(base_dir: str = DATA_DIR) -> List:
206
 
207
  # ---------- Build / load FAISS + BM25 ----------
208
  def build_or_load_indexes(force_reindex: bool = False):
 
209
  if os.getenv("FORCE_REINDEX", "0").lower() in ("1", "true", "yes"):
210
  force_reindex = True
211
 
212
- ensure_data_dir()
 
 
 
 
 
 
 
213
  docs = load_all_docs(DATA_DIR)
214
  if not docs:
215
- logger.warning("No documents found. Returning empty indexes.")
216
  return [], None, [], [], None
 
 
217
 
218
  # chunking
219
  if os.path.exists(CHUNKS_CACHE) and not force_reindex:
@@ -362,9 +377,12 @@ def build_or_load_indexes(force_reindex: bool = False):
362
 
363
  # ---------- Hybrid retrieve ----------
364
  def _ensure_index_built():
 
365
  if not hasattr(hybrid_retrieve, "_index_built") or not hybrid_retrieve._index_built:
 
366
  hybrid_retrieve._chunks, hybrid_retrieve._bm25, hybrid_retrieve._tokenized, hybrid_retrieve._corpus, hybrid_retrieve._faiss = build_or_load_indexes()
367
  hybrid_retrieve._index_built = True
 
368
 
369
 
370
  def _faiss_search(query: str, top_k: int = TOP_K_DOCS, subject: Optional[str] = None):
@@ -408,12 +426,19 @@ def _faiss_search(query: str, top_k: int = TOP_K_DOCS, subject: Optional[str] =
408
 
409
  def hybrid_retrieve(query: str, subject: Optional[str] = None, top_k: int = TOP_K_DOCS, max_chars: int = MAX_CONTEXT_CHARS) -> Dict[str, Any]:
410
  if not query:
 
411
  return {"context": None, "bm25_docs": [], "faiss_docs": [], "meta": []}
412
 
413
  _ensure_index_built()
414
 
415
  chunks = hybrid_retrieve._chunks
416
  bm25 = hybrid_retrieve._bm25
 
 
 
 
 
 
417
 
418
  # BM25
419
  results_bm25 = []
@@ -423,7 +448,11 @@ def hybrid_retrieve(query: str, subject: Optional[str] = None, top_k: int = TOP_
423
  scores = bm25.get_scores(q_tokens)
424
  ranked_idx = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
425
  for i in ranked_idx:
426
- results_bm25.append((float(scores[i]), chunks[i].metadata, chunks[i].page_content))
 
 
 
 
427
  except Exception:
428
  logger.exception("BM25 search failed")
429
 
@@ -431,6 +460,7 @@ def hybrid_retrieve(query: str, subject: Optional[str] = None, top_k: int = TOP_
431
  results_faiss = []
432
  try:
433
  results_faiss = _faiss_search(query, top_k=top_k, subject=subject)
 
434
  except Exception:
435
  logger.exception("FAISS search failed")
436
 
@@ -458,16 +488,26 @@ def hybrid_retrieve(query: str, subject: Optional[str] = None, top_k: int = TOP_
458
 
459
  # compose context parts with headers
460
  context_parts = []
 
461
  for i, t in enumerate(merged_texts):
 
 
 
 
462
  header = f"\n\n===== DOC {i+1} =====\n"
463
  context_parts.append(header + t)
 
464
  context = "\n".join(context_parts).strip()
465
  if not context:
 
 
466
  return {"context": None, "bm25_docs": results_bm25, "faiss_docs": results_faiss, "meta": merged_meta}
467
 
468
  if len(context) > max_chars:
469
  context = context[:max_chars].rstrip() + "..."
 
470
 
 
471
  return {"context": context, "bm25_docs": results_bm25, "faiss_docs": results_faiss, "meta": merged_meta}
472
 
473
 
@@ -477,14 +517,28 @@ def _last_n_user_messages(rows: List[tuple], n: int = 1) -> List[str]:
477
  return users[-1:] # always return ONLY the latest user query # only keep the last one
478
 
479
def retrieve_node_from_rows(rows: List[tuple], top_k: int = TOP_K_DOCS) -> Dict[str, Any]:
    """Run hybrid retrieval for the most recent user message found in *rows*."""
    latest = _last_n_user_messages(rows, n=1)
    query_text = " ".join(latest).strip() if latest else ""
    if not query_text:
        return {"context": None, "direct": False}

    # Subject detection is best-effort; retrieval proceeds without it on failure.
    subject_hint = None
    try:
        subject_hint = detect_subject(query_text)
    except Exception:
        subject_hint = None

    hits = hybrid_retrieve(query_text, subject=subject_hint, top_k=top_k, max_chars=MAX_CONTEXT_CHARS)
    return {"context": hits.get("context"), "direct": False}
 
 
 
 
 
 
 
100
  ]
101
 
102
  local_paths = []
103
+ downloaded_count = 0
104
  for f in files:
105
  dest_path = os.path.join(data_dir, f) # ✅ keep real folder structure
106
  os.makedirs(os.path.dirname(dest_path), exist_ok=True)
107
 
108
  if not os.path.exists(dest_path):
109
+ logger.info(f"📥 Downloading {f} from Hugging Face (public dataset)...")
110
  downloaded = hf_hub_download(
111
  repo_id=DATASET_REPO,
112
  filename=f,
 
114
  force_download=True,
115
  )
116
  shutil.copy(downloaded, dest_path) # ✅ copy instead of rename (works inside HF Spaces)
117
+ downloaded_count += 1
118
 
119
  local_paths.append(dest_path)
120
 
121
+ # Only print summary if files were actually downloaded
122
+ if downloaded_count > 0:
123
+ logger.info(f"✅ Downloaded {downloaded_count} new file(s). Total files ensured: {len(local_paths)}")
124
+ for p in local_paths[:3]:
125
+ logger.debug(f" → {p}")
126
+ else:
127
+ logger.debug(f"✅ All {len(local_paths)} data files already exist")
128
 
129
  return local_paths
130
 
 
211
 
212
  # ---------- Build / load FAISS + BM25 ----------
213
  def build_or_load_indexes(force_reindex: bool = False):
214
+ """Build or load FAISS and BM25 indexes. Returns (chunks, bm25, tokenized, corpus_texts, faiss_data)."""
215
  if os.getenv("FORCE_REINDEX", "0").lower() in ("1", "true", "yes"):
216
  force_reindex = True
217
 
218
+ # Only ensure data dir if files don't exist (check a sample file to avoid repeated calls)
219
+ sample_file = os.path.join(DATA_DIR, "cn.pdf")
220
+ if not os.path.exists(sample_file) or force_reindex:
221
+ logger.info("Data files missing or force_reindex=True, ensuring data directory...")
222
+ ensure_data_dir()
223
+ else:
224
+ logger.debug("Data files already exist, skipping ensure_data_dir()")
225
+
226
  docs = load_all_docs(DATA_DIR)
227
  if not docs:
228
+ logger.warning("No documents found in %s. Returning empty indexes.", DATA_DIR)
229
  return [], None, [], [], None
230
+
231
+ logger.info("Loaded %d document pages from %s", len(docs), DATA_DIR)
232
 
233
  # chunking
234
  if os.path.exists(CHUNKS_CACHE) and not force_reindex:
 
377
 
378
  # ---------- Hybrid retrieve ----------
379
def _ensure_index_built():
    """Ensure indexes are built. Only rebuilds if not already initialized."""
    # Fast path: indexes were already attached to hybrid_retrieve.
    if getattr(hybrid_retrieve, "_index_built", False):
        return

    logger.info("Initializing indexes for hybrid_retrieve...")
    (
        hybrid_retrieve._chunks,
        hybrid_retrieve._bm25,
        hybrid_retrieve._tokenized,
        hybrid_retrieve._corpus,
        hybrid_retrieve._faiss,
    ) = build_or_load_indexes()
    hybrid_retrieve._index_built = True

    chunk_count = len(hybrid_retrieve._chunks) if hybrid_retrieve._chunks else 0
    logger.info("Indexes initialized: %d chunks available", chunk_count)
386
 
387
 
388
  def _faiss_search(query: str, top_k: int = TOP_K_DOCS, subject: Optional[str] = None):
 
426
 
427
  def hybrid_retrieve(query: str, subject: Optional[str] = None, top_k: int = TOP_K_DOCS, max_chars: int = MAX_CONTEXT_CHARS) -> Dict[str, Any]:
428
  if not query:
429
+ logger.warning("hybrid_retrieve called with empty query")
430
  return {"context": None, "bm25_docs": [], "faiss_docs": [], "meta": []}
431
 
432
  _ensure_index_built()
433
 
434
  chunks = hybrid_retrieve._chunks
435
  bm25 = hybrid_retrieve._bm25
436
+
437
+ if not chunks:
438
+ logger.error("No chunks available for retrieval. Indexes may not be built correctly.")
439
+ return {"context": None, "bm25_docs": [], "faiss_docs": [], "meta": []}
440
+
441
+ logger.debug("Retrieving for query: %s (top_k=%d)", query[:50], top_k)
442
 
443
  # BM25
444
  results_bm25 = []
 
448
  scores = bm25.get_scores(q_tokens)
449
  ranked_idx = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
450
  for i in ranked_idx:
451
+ if i < len(chunks):
452
+ results_bm25.append((float(scores[i]), chunks[i].metadata, chunks[i].page_content))
453
+ logger.debug("BM25 found %d results", len(results_bm25))
454
+ else:
455
+ logger.warning("BM25 index is None")
456
  except Exception:
457
  logger.exception("BM25 search failed")
458
 
 
460
  results_faiss = []
461
  try:
462
  results_faiss = _faiss_search(query, top_k=top_k, subject=subject)
463
+ logger.debug("FAISS found %d results", len(results_faiss))
464
  except Exception:
465
  logger.exception("FAISS search failed")
466
 
 
488
 
489
  # compose context parts with headers
490
  context_parts = []
491
+ seen_texts = set() # Deduplicate by text content
492
  for i, t in enumerate(merged_texts):
493
+ # Deduplicate: skip if we've seen this text before
494
+ if t in seen_texts:
495
+ continue
496
+ seen_texts.add(t)
497
  header = f"\n\n===== DOC {i+1} =====\n"
498
  context_parts.append(header + t)
499
+
500
  context = "\n".join(context_parts).strip()
501
  if not context:
502
+ logger.warning("No context generated from retrieval for query: %s (BM25: %d, FAISS: %d results)",
503
+ query[:50], len(results_bm25), len(results_faiss))
504
  return {"context": None, "bm25_docs": results_bm25, "faiss_docs": results_faiss, "meta": merged_meta}
505
 
506
  if len(context) > max_chars:
507
  context = context[:max_chars].rstrip() + "..."
508
+ logger.debug("Context truncated from %d to %d characters", len("\n".join(context_parts)), max_chars)
509
 
510
+ logger.info("Retrieved context: %d characters from %d documents", len(context), len(context_parts))
511
  return {"context": context, "bm25_docs": results_bm25, "faiss_docs": results_faiss, "meta": merged_meta}
512
 
513
 
 
517
  return users[-1:] # always return ONLY the latest user query # only keep the last one
518
 
519
def retrieve_node_from_rows(rows: List[tuple], top_k: int = TOP_K_DOCS) -> Dict[str, Any]:
    """Retrieve context from documents based on the last user message in rows."""
    users = _last_n_user_messages(rows, n=1)
    query = " ".join(users).strip() if users else ""
    if not query:
        logger.warning("retrieve_node_from_rows: No user query found in rows")
        return {"context": None, "direct": False}

    logger.debug("retrieve_node_from_rows: Query='%s'", query[:50])

    # Best-effort subject detection; any failure falls back to no subject filter.
    subject = None
    try:
        subject = detect_subject(query)
        if subject:
            logger.debug("Detected subject: %s", subject)
    except Exception:
        subject = None

    retrieval = hybrid_retrieve(query, subject=subject, top_k=top_k, max_chars=MAX_CONTEXT_CHARS)
    ctx = retrieval.get("context")
    if ctx:
        logger.info("retrieve_node_from_rows: Successfully retrieved %d characters of context", len(ctx))
    else:
        logger.warning("retrieve_node_from_rows: No context retrieved for query: %s", query[:50])

    return {"context": ctx, "direct": False}
main_api.py CHANGED
@@ -16,10 +16,8 @@ from memory_store import init_db, save_message, get_last_messages, clear_user_me
16
  from chatbot_retriever import build_or_load_indexes, hybrid_retrieve, retrieve_node_from_rows, load_all_docs, ensure_data_dir # :contentReference[oaicite:5]{index=5}
17
  from chatbot_graph import SYSTEM_PROMPT, call_llm, _extract_answer_from_response # :contentReference[oaicite:6]{index=6}
18
 
19
- ensure_data_dir()
20
  # ----------------- CORS SETUP -----------------
21
- from fastapi.middleware.cors import CORSMiddleware
22
- ensure_data_dir()
23
  app = FastAPI(title="RAG Chat Backend", version="1.0")
24
 
25
  from fastapi.middleware.cors import CORSMiddleware
@@ -73,13 +71,42 @@ class RetrieveResponse(BaseModel):
73
  def ensure_indexes(force_reindex: bool = False):
74
  """
75
  Build or load indexes synchronously. This wraps build_or_load_indexes from chatbot_retriever.
 
76
  """
 
 
 
 
 
77
  if INDEXES["built"] and not force_reindex:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  return INDEXES["info"]
 
79
  try:
80
  chunks, bm25, tokenized, corpus_texts, faiss_data = build_or_load_indexes(force_reindex=force_reindex)
 
 
 
 
 
 
 
81
  INDEXES["built"] = True
82
  INDEXES["info"] = {"chunks_len": len(chunks) if chunks else 0, "corpus_len": len(corpus_texts) if corpus_texts else 0}
 
83
  return INDEXES["info"]
84
  except Exception:
85
  logger.exception("Index build/load failed")
@@ -221,8 +248,12 @@ def chat(req: ChatRequest):
221
  try:
222
  retrieved = retrieve_node_from_rows(rows)
223
  context = retrieved.get("context")
224
- except Exception:
225
- logger.exception("retriever call failed")
 
 
 
 
226
  context = None
227
 
228
  # 5) build system prompt content
@@ -242,6 +273,9 @@ def chat(req: ChatRequest):
242
  system_content = SYSTEM_PROMPT
243
  if trimmed_context:
244
  system_content += "\n\n===== RETRIEVED CONTEXT =====\n" + trimmed_context
 
 
 
245
  # build prompt messages as list of simple dicts (call_llm expects same message format as in chatbot_graph)
246
  # chatbot_graph.call_llm expects langchain messages (SystemMessage/HumanMessage) — we built that in original file.
247
  # create messages as minimal objects that call_llm can accept (we rely on original call_llm).
 
16
  from chatbot_retriever import build_or_load_indexes, hybrid_retrieve, retrieve_node_from_rows, load_all_docs, ensure_data_dir # :contentReference[oaicite:5]{index=5}
17
  from chatbot_graph import SYSTEM_PROMPT, call_llm, _extract_answer_from_response # :contentReference[oaicite:6]{index=6}
18
 
 
19
  # ----------------- CORS SETUP -----------------
20
+ from fastapi.middleware.cors import CORSMiddleware
 
21
  app = FastAPI(title="RAG Chat Backend", version="1.0")
22
 
23
  from fastapi.middleware.cors import CORSMiddleware
 
71
  def ensure_indexes(force_reindex: bool = False):
72
  """
73
  Build or load indexes synchronously. This wraps build_or_load_indexes from chatbot_retriever.
74
+ Also initializes hybrid_retrieve module variables to avoid reindexing.
75
  """
76
+ # Check if hybrid_retrieve already has indexes built (avoid duplicate work)
77
+ if hasattr(hybrid_retrieve, "_index_built") and hybrid_retrieve._index_built and not force_reindex:
78
+ if INDEXES["built"]:
79
+ return INDEXES["info"]
80
+
81
  if INDEXES["built"] and not force_reindex:
82
+ # Ensure hybrid_retrieve module variables are also set
83
+ if not hasattr(hybrid_retrieve, "_index_built") or not hybrid_retrieve._index_built:
84
+ # Indexes exist but hybrid_retrieve wasn't initialized, reload them
85
+ try:
86
+ chunks, bm25, tokenized, corpus_texts, faiss_data = build_or_load_indexes(force_reindex=False)
87
+ hybrid_retrieve._chunks = chunks
88
+ hybrid_retrieve._bm25 = bm25
89
+ hybrid_retrieve._tokenized = tokenized
90
+ hybrid_retrieve._corpus = corpus_texts
91
+ hybrid_retrieve._faiss = faiss_data
92
+ hybrid_retrieve._index_built = True
93
+ logger.info("Initialized hybrid_retrieve module variables from existing indexes")
94
+ except Exception:
95
+ logger.exception("Failed to initialize hybrid_retrieve variables")
96
  return INDEXES["info"]
97
+
98
  try:
99
  chunks, bm25, tokenized, corpus_texts, faiss_data = build_or_load_indexes(force_reindex=force_reindex)
100
+ # Set module-level variables in hybrid_retrieve to avoid rebuilding
101
+ hybrid_retrieve._chunks = chunks
102
+ hybrid_retrieve._bm25 = bm25
103
+ hybrid_retrieve._tokenized = tokenized
104
+ hybrid_retrieve._corpus = corpus_texts
105
+ hybrid_retrieve._faiss = faiss_data
106
+ hybrid_retrieve._index_built = True
107
  INDEXES["built"] = True
108
  INDEXES["info"] = {"chunks_len": len(chunks) if chunks else 0, "corpus_len": len(corpus_texts) if corpus_texts else 0}
109
+ logger.info("Indexes built/loaded: %d chunks, %d corpus texts", INDEXES["info"]["chunks_len"], INDEXES["info"]["corpus_len"])
110
  return INDEXES["info"]
111
  except Exception:
112
  logger.exception("Index build/load failed")
 
248
  try:
249
  retrieved = retrieve_node_from_rows(rows)
250
  context = retrieved.get("context")
251
+ if context:
252
+ logger.info("Retrieved context: %d characters from documents", len(context))
253
+ else:
254
+ logger.warning("Retriever returned empty context for query: %s", req.message)
255
+ except Exception as e:
256
+ logger.exception("retriever call failed: %s", e)
257
  context = None
258
 
259
  # 5) build system prompt content
 
273
  system_content = SYSTEM_PROMPT
274
  if trimmed_context:
275
  system_content += "\n\n===== RETRIEVED CONTEXT =====\n" + trimmed_context
276
+ logger.debug("Added context to system prompt: %d characters", len(trimmed_context))
277
+ else:
278
+ logger.warning("No context to add to system prompt for query: %s", req.message)
279
  # build prompt messages as list of simple dicts (call_llm expects same message format as in chatbot_graph)
280
  # chatbot_graph.call_llm expects langchain messages (SystemMessage/HumanMessage) — we built that in original file.
281
  # create messages as minimal objects that call_llm can accept (we rely on original call_llm).