Spaces:

NavyDevilDoc
/

AI_Toolkit

Sleeping

App Files Community

NavyDevilDoc committed on Dec 11, 2025

Commit

7841205

verified ·

1 Parent(s): c88e290

Update src/rag_engine.py

Browse files

Files changed (1) hide show

src/rag_engine.py +51 -52

src/rag_engine.py CHANGED Viewed

@@ -3,7 +3,7 @@ from langchain_chroma import Chroma
 from langchain_huggingface import HuggingFaceEmbeddings
 from sentence_transformers import CrossEncoder
 from core.ChunkingManager import ChunkingManager, ChunkingStrategy
-import tracker # To trigger syncs
 # --- CONFIGURATION ---
 UPLOAD_DIR = "/tmp/rag_uploads"
@@ -12,14 +12,11 @@ EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
 RERANKER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
 # --- LAZY LOADING SINGLETONS ---
-# We use these globals to store the models once loaded, so we don't reload them
-# every time a function is called, but we also don't load them on import.
 _embedding_fn = None
 _reranker = None
 _chunk_manager = None
 def get_embedding_function():
-    """Lazy loads the embedding model only when needed."""
     global _embedding_fn
     if _embedding_fn is None:
         print("⚙️ Loading Embedding Model...")
@@ -27,7 +24,6 @@ def get_embedding_function():
     return _embedding_fn
 def get_reranker_model():
-    """Lazy loads the CrossEncoder only when needed."""
     global _reranker
     if _reranker is None:
         print("⚙️ Loading Reranker Model...")
@@ -35,7 +31,6 @@ def get_reranker_model():
     return _reranker
 def get_chunk_manager():
-    """Lazy loads the Chunking Manager."""
     global _chunk_manager
     if _chunk_manager is None:
         print("⚙️ Loading Chunk Manager...")
@@ -44,8 +39,6 @@ def get_chunk_manager():
 # --- DATABASE OPERATIONS ---
 def get_vectorstore(username):
-    """Returns the persistent ChromaDB for a SPECIFIC USER."""
-    # Safety: Ensure username doesn't contain path traversal characters
     safe_username = os.path.basename(username)
     user_db_path = os.path.join(DB_ROOT, safe_username)
@@ -59,14 +52,10 @@ def get_vectorstore(username):
     )
 def save_uploaded_file(uploaded_file):
-    """Saves upload to temp, sanitizing the filename."""
     if not os.path.exists(UPLOAD_DIR):
         os.makedirs(UPLOAD_DIR)
-    # SECURITY FIX: Sanitize filename to prevent directory traversal
     safe_filename = os.path.basename(uploaded_file.name)
     file_path = os.path.join(UPLOAD_DIR, safe_filename)
     with open(file_path, "wb") as f:
         f.write(uploaded_file.getbuffer())
     return file_path
@@ -82,7 +71,6 @@ def process_and_add_document(file_path, username, strategy="paragraph"):
         }
         selected_strategy = strat_map.get(strategy, ChunkingStrategy.PARAGRAPH)
-        # Use the lazy-loaded chunk manager
         manager = get_chunk_manager()
         chunks = manager.process_document(
             file_path=file_path,
@@ -93,11 +81,14 @@ def process_and_add_document(file_path, username, strategy="paragraph"):
         if not chunks:
             return False, "No text extracted. Is the file empty/scanned?"
         print(f"💾 Indexing {len(chunks)} chunks into Vector DB...")
         db = get_vectorstore(username)
         db.add_documents(chunks)
-        # Sync immediately
         tracker.upload_user_db(username)
         if os.path.exists(file_path):
@@ -110,50 +101,56 @@ def process_and_add_document(file_path, username, strategy="paragraph"):
         return False, str(e)
 # --- RETRIEVAL ENGINE ---
-def search_knowledge_base(query, username, k=10):
     """
     Two-Stage Retrieval System (RAG):
     1. Retrieval: Get 10 candidates via fast Vector Search.
     2. Reranking: Sort them via Cross-Encoder (Slow/Precise).
     3. Return top k.
     """
-    db = get_vectorstore(username)
-    reranker = get_reranker_model()
-    # 1. Broad Search (Retrieve more than needed to filter later)
-    results = db.similarity_search(query, k=10)
-    if not results:
-        return []
-    # 2. Reranking
-    # Prepare pairs: [[Query, Text1], [Query, Text2]...]
-    passages = [doc.page_content for doc in results]
-    ranks = reranker.rank(query, passages)
-    # 3. Sort and Filter
-    # Reranker returns list of dicts: {'corpus_id': 0, 'score': 0.9}
-    top_results = []
-    # Sort ranks by score descending just to be safe (though .rank() usually sorts)
-    sorted_ranks = sorted(ranks, key=lambda x: x['score'], reverse=True)
-    for rank in sorted_ranks[:k]:
-        doc_index = rank['corpus_id']
-        doc = results[doc_index]
-        # Append score for transparency
-        doc.metadata["relevance_score"] = round(rank['score'], 4)
-        top_results.append(doc)
-    return top_results
 def list_documents(username):
-    """
-    Returns a list of unique files currently in the user's database.
-    WARNING: This pulls all metadata. Performance degrades >10k chunks.
-    """
     try:
         db = get_vectorstore(username)
         data = db.get()
         metadatas = data['metadatas']
@@ -162,9 +159,16 @@ def list_documents(username):
         for meta in metadatas:
             src = meta.get('source', 'unknown')
             filename = os.path.basename(src)
             if src not in file_stats:
-                file_stats[src] = {'source': src, 'filename': filename, 'chunks': 0}
             file_stats[src]['chunks'] += 1
         return list(file_stats.values())
@@ -174,21 +178,16 @@ def list_documents(username):
         return []
 def delete_document(username, source_path):
-    """Removes all chunks associated with a specific source file."""
     try:
         print(f"🗑️ Deleting {source_path} for {username}...")
         db = get_vectorstore(username)
         db.delete(where={"source": source_path})
         tracker.upload_user_db(username)
         return True, f"Deleted {os.path.basename(source_path)}"
     except Exception as e:
         return False, str(e)
 def reset_knowledge_base(username):
-    """Nuke option: Clears the entire database for the user."""
     try:
         db = get_vectorstore(username)
         db.delete_collection()

 from langchain_huggingface import HuggingFaceEmbeddings
 from sentence_transformers import CrossEncoder
 from core.ChunkingManager import ChunkingManager, ChunkingStrategy
+import tracker
 # --- CONFIGURATION ---
 UPLOAD_DIR = "/tmp/rag_uploads"
 RERANKER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
 # --- LAZY LOADING SINGLETONS ---
 _embedding_fn = None
 _reranker = None
 _chunk_manager = None
 def get_embedding_function():
     global _embedding_fn
     if _embedding_fn is None:
         print("⚙️ Loading Embedding Model...")
     return _embedding_fn
 def get_reranker_model():
     global _reranker
     if _reranker is None:
         print("⚙️ Loading Reranker Model...")
     return _reranker
 def get_chunk_manager():
     global _chunk_manager
     if _chunk_manager is None:
         print("⚙️ Loading Chunk Manager...")
 # --- DATABASE OPERATIONS ---
 def get_vectorstore(username):
     safe_username = os.path.basename(username)
     user_db_path = os.path.join(DB_ROOT, safe_username)
     )
 def save_uploaded_file(uploaded_file):
     if not os.path.exists(UPLOAD_DIR):
         os.makedirs(UPLOAD_DIR)
     safe_filename = os.path.basename(uploaded_file.name)
     file_path = os.path.join(UPLOAD_DIR, safe_filename)
     with open(file_path, "wb") as f:
         f.write(uploaded_file.getbuffer())
     return file_path
         }
         selected_strategy = strat_map.get(strategy, ChunkingStrategy.PARAGRAPH)
         manager = get_chunk_manager()
         chunks = manager.process_document(
             file_path=file_path,
         if not chunks:
             return False, "No text extracted. Is the file empty/scanned?"
+        # FIX #1: Tag every chunk with the strategy used
+        for chunk in chunks:
+            chunk.metadata["strategy"] = strategy
         print(f"💾 Indexing {len(chunks)} chunks into Vector DB...")
         db = get_vectorstore(username)
         db.add_documents(chunks)
         tracker.upload_user_db(username)
         if os.path.exists(file_path):
         return False, str(e)
 # --- RETRIEVAL ENGINE ---
+def search_knowledge_base(query, username, k=3):
     """
     Two-Stage Retrieval System (RAG):
     1. Retrieval: Get 10 candidates via fast Vector Search.
     2. Reranking: Sort them via Cross-Encoder (Slow/Precise).
     3. Return top k.
     """
+    try:
+        db = get_vectorstore(username)
+        # FIX #3: Graceful handling for empty/missing DB
+        # If the collection is empty, Chroma sometimes throws an error or returns nothing.
+        # We check count first to be safe.
+        if db._collection.count() == 0:
+            return []
+        reranker = get_reranker_model()
+        # 1. Broad Search
+        results = db.similarity_search(query, k=10)
+        if not results:
+            return []
+        # 2. Reranking
+        passages = [doc.page_content for doc in results]
+        ranks = reranker.rank(query, passages)
+        top_results = []
+        sorted_ranks = sorted(ranks, key=lambda x: x['score'], reverse=True)
+        for rank in sorted_ranks[:k]:
+            doc_index = rank['corpus_id']
+            doc = results[doc_index]
+            doc.metadata["relevance_score"] = round(rank['score'], 4)
+            top_results.append(doc)
+        return top_results
+    except Exception as e:
+        print(f"⚠️ Search Error (likely empty DB): {e}")
+        return []
 def list_documents(username):
     try:
         db = get_vectorstore(username)
+        # Check if empty before fetching to prevent errors
+        if db._collection.count() == 0:
+            return []
         data = db.get()
         metadatas = data['metadatas']
         for meta in metadatas:
             src = meta.get('source', 'unknown')
             filename = os.path.basename(src)
+            # FIX #2: Retrieve the strategy (Default to 'unknown' for old docs)
+            strat = meta.get('strategy', 'unknown')
             if src not in file_stats:
+                file_stats[src] = {
+                    'source': src,
+                    'filename': filename,
+                    'chunks': 0,
+                    'strategy': strat
+                }
             file_stats[src]['chunks'] += 1
         return list(file_stats.values())
         return []
 def delete_document(username, source_path):
     try:
         print(f"🗑️ Deleting {source_path} for {username}...")
         db = get_vectorstore(username)
         db.delete(where={"source": source_path})
         tracker.upload_user_db(username)
         return True, f"Deleted {os.path.basename(source_path)}"
     except Exception as e:
         return False, str(e)
 def reset_knowledge_base(username):
     try:
         db = get_vectorstore(username)
         db.delete_collection()