Kalpokoch committed
Commit 01687a9 · verified · 1 Parent(s): bbdcb91

Update app/app.py

Files changed (1)
  1. app/app.py +132 -98
app/app.py CHANGED
@@ -9,21 +9,6 @@ from pydantic import BaseModel
 from llama_cpp import Llama
 from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
 
-# -----------------------------
-# ✅ Optimized Configuration for Hugging Face Free Tier
-# -----------------------------
-DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
-CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_final.jsonl")
-MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
-LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "60"))  # Reduced timeout for free tier
-RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.3"))
-TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "3"))  # Reduced for efficiency
-TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "2"))
-
-# ✅ Single-threaded CPU optimization
-LLM_THREADS = 1  # Single thread for free tier
-MAX_CONCURRENT_REQUESTS = 1  # Process one request at a time
-
 # -----------------------------
 # ✅ Logging Configuration
 # -----------------------------
@@ -36,12 +21,25 @@ class RequestIdAdapter(logging.LoggerAdapter):
 logger = logging.getLogger("app")
 
 # -----------------------------
-# ✅ Initialize FastAPI App with Request Limiting
+# ✅ Configuration - Restored Original Efficient Settings
 # -----------------------------
-app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="2.4.0")
+DB_PERSIST_DIRECTORY = os.getenv("DB_PERSIST_DIRECTORY", "/app/vector_database")
+CHUNKS_FILE_PATH = os.getenv("CHUNKS_FILE_PATH", "/app/granular_chunks_final.jsonl")
+MODEL_PATH = os.getenv("MODEL_PATH", "/app/tinyllama_dop_q4_k_m.gguf")
+LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "90"))  # Back to original timeout
+RELEVANCE_THRESHOLD = float(os.getenv("RELEVANCE_THRESHOLD", "0.3"))
+TOP_K_SEARCH = int(os.getenv("TOP_K_SEARCH", "3"))  # Keep reduced for efficiency
+TOP_K_CONTEXT = int(os.getenv("TOP_K_CONTEXT", "1"))  # Keep reduced for efficiency
 
-# ✅ Request queue to ensure single processing
-request_semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
+# ✅ Single request processing without blocking semaphore
+MAX_CONCURRENT_REQUESTS = 1
+request_in_progress = False
+request_lock = asyncio.Lock()
+
+# -----------------------------
+# ✅ Initialize FastAPI App
+# -----------------------------
+app = FastAPI(title="NEEPCO DoP RAG Chatbot", version="2.5.0")
 
 @app.middleware("http")
 async def add_request_id(request: Request, call_next):
@@ -73,23 +71,19 @@ except Exception as e:
     db_ready = False
 
 # -----------------------------
-# ✅ Memory-Optimized GGUF Model Loading for Free Tier
+# ✅ Load GGUF Model - Restored Original Efficient Settings
 # -----------------------------
-logger.info(f"Loading GGUF model for single-threaded processing from: {MODEL_PATH}")
+logger.info(f"Loading GGUF model from: {MODEL_PATH}")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
-        n_ctx=2048,              # Reduced context size for memory efficiency
-        n_threads=LLM_THREADS,   # Single thread
-        n_batch=256,             # Smaller batch size for memory efficiency
-        use_mlock=False,         # Disable memory locking
-        use_mmap=True,           # Enable memory mapping for efficiency
-        verbose=False,
-        n_gpu_layers=0,          # CPU only
-        f16_kv=True,             # Use 16-bit for key-value cache to save memory
-        low_vram=True,           # Enable low VRAM mode for better memory usage
+        n_ctx=4096,      # ✅ Restored original context size
+        n_threads=4,     # ✅ Restored original thread count for efficient CPU usage
+        n_batch=512,     # ✅ Restored original batch size
+        use_mlock=True,  # ✅ Restored original memory settings
+        verbose=False
     )
-    logger.info("GGUF model loaded successfully for single-threaded processing.")
+    logger.info("GGUF model loaded successfully.")
     model_ready = True
 except Exception as e:
     logger.error(f"FATAL: Failed to load GGUF model: {e}", exc_info=True)
@@ -111,7 +105,7 @@ class Feedback(BaseModel):
     comment: str | None = None
 
 # -----------------------------
-# ✅ Query Processing Functions (Unchanged)
+# ✅ Enhanced Query Processing Functions
 # -----------------------------
 def classify_query_type(question: str) -> str:
     """Classify the type of query to choose appropriate search strategy."""
@@ -210,33 +204,34 @@ Your task is to answer the user's question based ONLY on the provided context.
     return prompt
 
 # -----------------------------
-# ✅ Synchronous LLM Response Generation (No Threading)
+# ✅ Efficient LLM Response Generation - Restored Original Async Pattern
 # -----------------------------
-def generate_llm_response_sync(prompt: str, request_id: str) -> str:
-    """Synchronous LLM generation optimized for single-threaded processing."""
-    try:
-        # ✅ Optimized parameters for free tier CPU
-        response = llm(
+async def generate_llm_response(prompt: str, request_id: str):
+    """Async LLM generation using original efficient pattern."""
+    loop = asyncio.get_running_loop()
+
+    def llm_call():
+        return llm(
             prompt,
-            max_tokens=1024,  # Reduced token limit for faster processing
+            max_tokens=2048,  # ✅ Restored original token limit
             stop=["###", "Question:", "Context:", "</s>"],
-            temperature=0.1,  # Lower temperature for consistent responses
-            top_p=0.9,
-            repeat_penalty=1.1,
+            temperature=0.05,  # ✅ Restored original temperature
             echo=False
         )
-
-        if response and "choices" in response and len(response["choices"]) > 0:
-            return response["choices"][0]["text"].strip()
-        else:
-            raise ValueError("Empty or invalid response from LLM")
-
-    except Exception as e:
-        logger.error(f"LLM generation error for request {request_id}: {e}")
-        raise
+
+    # ✅ Use original async executor pattern for efficient CPU usage
+    response = await loop.run_in_executor(None, llm_call)
+
+    if response and "choices" in response and len(response["choices"]) > 0:
+        answer = response["choices"][0]["text"].strip()
+        if not answer:
+            raise ValueError("Empty response from LLM")
+        return answer
+    else:
+        raise ValueError("Invalid response from LLM")
 
 # -----------------------------
-# ✅ Endpoints with Request Limiting
+# ✅ Endpoints with Lightweight Request Management
 # -----------------------------
 def get_logger_adapter(request: Request):
     return RequestIdAdapter(logger, {'request_id': getattr(request.state, 'request_id', 'N/A')})
@@ -244,10 +239,10 @@ def get_logger_adapter(request: Request):
 @app.get("/")
 async def root():
     return {
-        "status": "✅ Server is running on Hugging Face Free Tier",
-        "mode": "Single-threaded processing",
-        "max_concurrent_requests": MAX_CONCURRENT_REQUESTS,
-        "llm_threads": LLM_THREADS
+        "status": "✅ Server is running efficiently",
+        "mode": "CPU optimized for Hugging Face",
+        "model_loaded": model_ready,
+        "db_ready": db_ready
     }
 
 @app.get("/health")
@@ -256,8 +251,7 @@ async def health_check():
         "status": "ok",
         "database_status": "ready" if db_ready else "error",
         "model_status": "ready" if model_ready else "error",
-        "processing_mode": "single_threaded",
-        "max_concurrent_requests": MAX_CONCURRENT_REQUESTS
+        "processing_mode": "efficient_cpu_usage"
     }
     if not db_ready or not model_ready:
         raise HTTPException(status_code=503, detail=status)
@@ -265,14 +259,19 @@
 
 @app.post("/chat")
 async def chat(query: Query, request: Request):
-    # ✅ Acquire semaphore to ensure single request processing
-    async with request_semaphore:
+    global request_in_progress
+
+    # ✅ Lightweight request management - reject if busy instead of blocking
+    async with request_lock:
+        if request_in_progress:
+            raise HTTPException(status_code=429, detail="Server is busy processing another request. Please try again in a moment.")
+        request_in_progress = True
+
+    try:
         adapter = get_logger_adapter(request)
-        adapter.info("Processing request (single-threaded mode)")
-
         question_lower = query.question.strip().lower()
 
-        # Greeting handling
+        # --- GREETING & INTRO HANDLING ---
         greeting_keywords = ["hello", "hi", "hey", "what can you do", "who are you"]
        if question_lower in greeting_keywords:
             adapter.info(f"Handling a greeting or introductory query: '{query.question}'")
@@ -300,22 +299,32 @@
 
         search_results = []
 
+        # Enhanced search strategy
         if query_type == "monetary":
             amount = extract_monetary_amount(query.question)
             if amount:
                 adapter.info(f"Extracted monetary amount: ₹{amount}")
-                monetary_results = db.search_by_amount(amount, comparison=">=", top_k=TOP_K_SEARCH)
-                if monetary_results:
-                    search_results = monetary_results
-                    adapter.info(f"Found {len(search_results)} results using monetary search")
+                try:
+                    monetary_results = db.search_by_amount(amount, comparison=">=", top_k=TOP_K_SEARCH)
+                    if monetary_results:
+                        search_results = monetary_results
+                        adapter.info(f"Found {len(search_results)} results using monetary search")
+                except:
+                    adapter.info("Monetary search not available, falling back to semantic search")
 
         if not search_results:
-            search_results = db.search_with_context(
-                query.question,
-                top_k=TOP_K_SEARCH,
-                include_related=True
-            )
-            adapter.info(f"Found {len(search_results)} results using semantic search with context")
+            # Use enhanced search if available, otherwise fallback to basic search
+            try:
+                search_results = db.search_with_context(
+                    query.question,
+                    top_k=TOP_K_SEARCH,
+                    include_related=True
+                )
+                adapter.info(f"Found {len(search_results)} results using enhanced semantic search")
+            except:
+                # Fallback to basic search
+                search_results = db.search(query.question, top_k=TOP_K_SEARCH)
+                adapter.info(f"Found {len(search_results)} results using basic search")
 
         if not search_results:
             adapter.warning("No relevant context found in vector DB.")
@@ -326,43 +335,62 @@
                 "answer": "Sorry, I could not find a relevant policy to answer that question. Please try rephrasing or ask about specific delegation limits, approval authorities, or procedures."
             }
 
-        # Log search results with metadata
-        result_info = []
-        for i, result in enumerate(search_results):
-            metadata = result.get('metadata', {})
-            role = metadata.get('role', 'N/A')
-            section = metadata.get('section', 'N/A')
-            score = result.get('relevance_score', 0)
-            result_info.append(f"#{i+1}: Score={score:.3f}, Role={role}, Section={section}")
-
-        adapter.info(f"Search results: {' | '.join(result_info)}")
+        # Log search results
+        scores = [f"{result.get('relevance_score', 0):.4f}" for result in search_results]
+        adapter.info(f"Found {len(search_results)} relevant chunks with scores: {scores}")
 
-        # Prepare context with metadata
+        # Prepare context with metadata if available
         context_chunks = []
         for result in search_results[:TOP_K_CONTEXT]:
             chunk_text = result['text']
             metadata = result.get('metadata', {})
 
-            if metadata.get('section') or metadata.get('role'):
+            if metadata and (metadata.get('section') or metadata.get('role')):
                 metadata_prefix = f"[Section: {metadata.get('section', 'N/A')}, Role: {metadata.get('role', 'N/A')}] "
                 chunk_text = metadata_prefix + chunk_text
 
             context_chunks.append(chunk_text)
 
         context = "\n---\n".join(context_chunks)
-        prompt = build_enhanced_prompt(query.question, context, query_type, search_results)
 
-        # Generate response synchronously
+        # Build prompt - use enhanced if search results have metadata, otherwise simple
+        if any(result.get('metadata') for result in search_results):
+            prompt = build_enhanced_prompt(query.question, context, query_type, search_results)
+            adapter.info(f"Using enhanced prompt for {query_type} query")
+        else:
+            # Fallback to original simple prompt
+            prompt = f"""<|system|>
+You are a precise and factual assistant for NEEPCO's Delegation of Powers (DoP) policy.
+Your task is to answer the user's question based ONLY on the provided context.
+
+- **Formatting Rule:** If the answer contains a list of items or steps, you **MUST** separate each item with a pipe symbol (`|`). For example: `First item|Second item|Third item`.
+- **Content Rule:** If the information is not in the provided context, you **MUST** reply with the exact phrase: "The provided policy context does not contain information on this topic."
+</s>
+<|user|>
+### Relevant Context:
+```
+{context}
+```
+
+### Question:
+{query.question}
+</s>
+<|assistant|>
+### Detailed Answer:
+"""
+            adapter.info("Using original simple prompt")
+
+        # Generate response using original efficient async pattern
         answer = "An error occurred while processing your request."
         try:
-            adapter.info(f"Sending prompt to LLM for {query_type} query (synchronous processing)...")
-
-            # ✅ Direct synchronous call - no threading or async execution
-            raw_answer = generate_llm_response_sync(prompt, request.state.request_id)
-
+            adapter.info("Sending prompt to LLM for generation...")
+            raw_answer = await asyncio.wait_for(
+                generate_llm_response(prompt, request.state.request_id),
+                timeout=LLM_TIMEOUT_SECONDS
+            )
             adapter.info(f"LLM generation successful. Raw response: {raw_answer[:250]}...")
 
-            # Post-processing
+            # Post-processing logic
             if '|' in raw_answer:
                 adapter.info("Pipe separator found. Formatting response as a bulleted list.")
                 items = raw_answer.split('|')
@@ -371,10 +399,14 @@
             else:
                 answer = raw_answer.strip()
 
+            # Add monetary context if needed
             if query_type == "monetary" and "₹" not in answer and extract_monetary_amount(query.question):
                 amount = extract_monetary_amount(query.question)
                 answer = f"For amounts of ₹{amount:,.0f}:\n\n{answer}"
 
+        except asyncio.TimeoutError:
+            adapter.warning(f"LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds.")
+            answer = "Sorry, the request took too long to process. Please try again with a simpler question."
         except Exception as e:
             adapter.error(f"An unexpected error occurred during LLM generation: {e}", exc_info=True)
             answer = "Sorry, an unexpected error occurred while generating a response."
@@ -385,10 +417,13 @@
             "question": query.question,
             "context_used": context,
             "answer": answer,
-            "query_type": query_type,
-            "search_strategy": "monetary" if query_type == "monetary" and extract_monetary_amount(query.question) else "semantic_with_context",
-            "processing_mode": "single_threaded"
+            "query_type": query_type if 'query_type' in locals() else "general"
         }
+
+    finally:
+        # ✅ Always release the lock
+        async with request_lock:
+            request_in_progress = False
 
 @app.post("/feedback")
 async def collect_feedback(feedback: Feedback, request: Request):
@@ -405,7 +440,6 @@ async def collect_feedback(feedback: Feedback, request: Request):
     adapter.info(json.dumps(feedback_log))
     return {"status": "✅ Feedback recorded. Thank you!"}
 
-# ✅ No cleanup needed for single-threaded processing
 @app.on_event("shutdown")
 async def shutdown_event():
-    logger.info("Application shutting down (single-threaded mode).")
+    logger.info("Application shutting down.")
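
Note on the new busy-handling behaviour: with this commit, /chat no longer waits on request_semaphore but immediately returns HTTP 429 while another request is in flight, so callers need a small retry loop. Below is a minimal client-side sketch; the BASE_URL, port, and ask() helper are illustrative assumptions and are not part of app/app.py.

    # Illustrative client sketch (assumed BASE_URL; not part of the commit).
    import time
    import requests

    BASE_URL = "http://localhost:7860"  # hypothetical URL of the deployed Space

    def ask(question: str, retries: int = 5, backoff_seconds: float = 2.0) -> str:
        """POST a question to /chat and retry briefly when the server replies 429 (busy)."""
        for attempt in range(retries):
            resp = requests.post(f"{BASE_URL}/chat", json={"question": question}, timeout=120)
            if resp.status_code == 429:
                # The server now rejects concurrent requests instead of queueing them.
                time.sleep(backoff_seconds * (attempt + 1))
                continue
            resp.raise_for_status()
            return resp.json()["answer"]
        raise RuntimeError("Server stayed busy after several retries.")

    print(ask("What is the delegation limit for open tenders?"))

Because request_in_progress is reset in the handler's finally block, a failed or timed-out request releases the slot, and a retry loop like the one above will eventually get through.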