Update app/app.py
app/app.py  (+35 -31)
@@ -4,12 +4,12 @@ from llama_cpp import Llama
 import logging
 from app.policy_vector_db import PolicyVectorDB, ensure_db_populated
 import asyncio
-import os
+import os
 
 # -----------------------------
 # ✅ Logging Configuration
 # -----------------------------
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger("app")
 
 # -----------------------------
@@ -46,7 +46,7 @@ db = PolicyVectorDB(
     relevance_threshold=0  # Low for more inclusive matching
 )
 if not ensure_db_populated(db, CHUNKS_FILE_PATH):
-    logger.warning("[WARNING] DB not populated. Chunks file may be missing.")
+    logger.warning("[WARNING] DB not populated. Chunks file may be missing or empty. RAG will not function correctly.")
 else:
     logger.info("[INFO] Vector DB ready.")
 
@@ -55,11 +55,16 @@ else:
 # -----------------------------
 MODEL_PATH = "/app/tinyllama_dop_q4_k_m.gguf"
 logger.info(f"[INFO] Loading GGUF model from: {MODEL_PATH}")
+
+# --- IMPORTANT: Experiment with these Llama parameters based on your CPU performance ---
+# n_ctx: Context window size. Adjust if your typical prompts are significantly shorter.
+# n_threads: Number of CPU threads. Experiment with 1, 2, or 4 for best results on your specific CPU.
+# n_batch: Batch size for internal processing. Experiment with 4, 8, or 16 for latency vs. throughput.
 llm = Llama(
     model_path=MODEL_PATH,
-    n_ctx=512, #
-    n_threads=2,
-    n_batch=16,
+    n_ctx=512,      # Keep at 512, or reduce if your context is always short, e.g., 384
+    n_threads=2,    # <--- EXPERIMENT HERE (try 1, 2, or 4)
+    n_batch=16,     # <--- EXPERIMENT HERE (try 4, 8, or 16)
     use_mlock=False,
     verbose=False
 )
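The tuning comments added above suggest experimenting with n_ctx, n_threads, and n_batch. One way to do that without editing code is to read the knobs from environment variables; the sketch below is not part of this commit, and the LLAMA_N_CTX / LLAMA_N_THREADS / LLAMA_N_BATCH variable names are hypothetical.

import os
from llama_cpp import Llama

MODEL_PATH = "/app/tinyllama_dop_q4_k_m.gguf"

# Hypothetical env-driven tuning so n_ctx / n_threads / n_batch can be changed
# per deployment (e.g. as Space variables) without a code change.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=int(os.getenv("LLAMA_N_CTX", "512")),        # context window size
    n_threads=int(os.getenv("LLAMA_N_THREADS", "2")),  # CPU threads: try 1, 2, or 4
    n_batch=int(os.getenv("LLAMA_N_BATCH", "16")),     # batch size: try 4, 8, or 16
    use_mlock=False,
    verbose=False,
)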
@@ -76,24 +81,20 @@ class Query(BaseModel):
 # -----------------------------
 
 # Define a reasonable timeout for LLM inference (e.g., 30 seconds)
-#
+# This can be configured via an environment variable in Hugging Face Spaces.
 LLM_TIMEOUT_SECONDS = int(os.getenv("LLM_TIMEOUT_SECONDS", "30")) # Default to 30 seconds
 
 async def generate_llm_response(prompt: str):
-    """
-    ...
-        return answer
-    except Exception as e:
-        logger.error(f"[ERROR] LLM generation failed: {str(e)}")
-        raise  # Re-raise to be caught by the timeout mechanism or outer try/except
+    """
+    Helper function to run synchronous LLM inference.
+    FastAPI's async def automatically runs blocking code in a thread pool,
+    making it compatible with asyncio.wait_for.
+    """
+    response = llm(prompt, max_tokens=128, stop=["###"], temperature=0.2)
+    answer = response["choices"][0]["text"].strip()
+    if not answer:
+        raise ValueError("Empty response from LLM")
+    return answer
 
 @app.post("/chat")
 async def chat(query: Query):
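One caveat with the rewritten helper: generate_llm_response is an async def, so the blocking llm(...) call runs on the event loop itself (FastAPI's thread-pool offloading applies to plain def endpoints, not to code inside a coroutine), which means asyncio.wait_for cannot fire until the call returns. A minimal sketch of one way to make the timeout effective, assuming Python 3.9+ and the llm instance and parameters defined earlier in app.py; this is an alternative, not the committed code.

import asyncio

async def generate_llm_response(prompt: str) -> str:
    """Run the blocking llama_cpp call in a worker thread so the event loop stays free."""
    def _infer() -> str:
        # Same call as in the committed helper.
        response = llm(prompt, max_tokens=128, stop=["###"], temperature=0.2)
        return response["choices"][0]["text"].strip()

    answer = await asyncio.to_thread(_infer)  # offload to a worker thread (Python 3.9+)
    if not answer:
        raise ValueError("Empty response from LLM")
    return answer

With this variant, asyncio.wait_for raises TimeoutError at the deadline while the worker thread finishes in the background, so the /chat handler can still return the timeout message promptly.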
@@ -110,18 +111,20 @@ async def chat(query: Query):
         reverse=True
     )
 
-    # 🪵 Log context scores
+    # 🪵 Log context scores for debugging
     for i, r in enumerate(search_results):
-        logger.info(f"[DEBUG] Chunk {i+1} | Score: {r['relevance_score']:.4f} | Snippet: {r['text'][:80]}")
+        logger.info(f"[DEBUG] Chunk {i+1} | Score: {r['relevance_score']:.4f} | Snippet: {r['text'][:80]}...")
 
     if not filtered:
+        logger.info("[RESPONSE] No relevant context found.")
         return {
             "question": question,
             "context_used": "No relevant context found above the relevance threshold.",
-            "answer": "Sorry, I need more detail in the question to provide an answer."
+            "answer": "Sorry, I need more detail in the question to provide an answer based on the policies."
         }
 
     context = filtered[0]["text"]
+    logger.info(f"[INFO] Using top context (score: {filtered[0]['relevance_score']:.4f}): {context[:100]}...")
 
     # ✨ Prompt Template
     prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies.Only use the context provided. Be precise.### Relevant Context:{context}### Question:{question}### Answer:"""
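Because the prompt template sits on a single line, the section markers run straight into the surrounding text ("policies.Only use", "precise.### Relevant Context:"). A possible multi-line variant with the same content, shown only as a readability sketch rather than the template shipped in this commit:

    prompt = f"""You are a helpful assistant trained on NEEPCO Delegation of Powers (DoP) policies.
Only use the context provided. Be precise.

### Relevant Context:
{context}

### Question:
{question}

### Answer:"""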
@@ -129,18 +132,19 @@ async def chat(query: Query):
     # 🔮 Run LLM with safety and timeout
     answer = "Sorry, I couldn't process your request right now. Please try again later or rephrase your question."
     try:
-        #
-        # If generate_llm_response doesn't complete within LLM_TIMEOUT_SECONDS,
-        # asyncio.TimeoutError will be raised.
+        # Enforce timeout for LLM generation
         answer = await asyncio.wait_for(generate_llm_response(prompt), timeout=LLM_TIMEOUT_SECONDS)
     except asyncio.TimeoutError:
-        logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds for question: {question}")
+        logger.warning(f"[TIMEOUT] LLM generation timed out after {LLM_TIMEOUT_SECONDS} seconds for question: '{question}'")
         answer = "Sorry, I couldn't process your request right now. The operation took too long. Please try again later or rephrase your question."
+    except ValueError as e:  # Catch explicit ValueError from generate_llm_response (e.g., empty response)
+        logger.error(f"[ERROR] LLM generation returned an invalid response: {str(e)} for question: '{question}'")
+        answer = "Sorry, I couldn't process your request right now. The model returned an invalid answer. Please try again later or rephrase your question."
     except Exception as e:
-        logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)}")
-        answer = "Sorry, I couldn't process your request right now. Please try again later or rephrase your question."
+        logger.error(f"[ERROR] An unexpected error occurred during LLM generation: {str(e)} for question: '{question}'")
+        answer = "Sorry, I couldn't process your request right now. An unexpected error occurred. Please try again later or rephrase your question."
 
-    logger.info(f"[RESPONSE] {answer}")
+    logger.info(f"[RESPONSE] Answered for '{question}': {answer[:100]}...")  # Log beginning of answer
     return {
         "question": question,
         "context_used": context,
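A quick way to exercise the /chat endpoint end to end after this change; it assumes the Space serves the FastAPI app on localhost:7860 and that the Query model declares a question field, neither of which is shown in this diff.

import requests  # third-party HTTP client (pip install requests)

resp = requests.post(
    "http://localhost:7860/chat",  # hypothetical local URL; adjust host/port to your deployment
    json={"question": "What are the delegation of powers for procurement?"},
    timeout=60,  # allow for LLM_TIMEOUT_SECONDS plus retrieval overhead
)
data = resp.json()
print(data["answer"])        # model answer or one of the fallback messages
print(data["context_used"])  # top retrieved policy chunk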