changed the retriever knowledge

chatbot_retriever.py  +26 -12

@@ -262,8 +262,9 @@ def build_or_load_indexes(force_reindex: bool = False):
     try:
         index = faiss.read_index(FAISS_INDEX_PATH)
         with open(FAISS_META_PATH, "rb") as f:
-
+            meta = pickle.load(f)
         texts = meta.get("texts", corpus_texts)
+        metadatas = meta.get("metadatas", metadatas)
         try:
             index.nprobe = FAISS_NPROBE
         except Exception:
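
A note on the unchanged try/except around index.nprobe in this hunk: nprobe is an IVF-specific knob (how many inverted-list clusters a query visits), so the guard lets the same loading path handle index types without that attribute. A minimal sketch of the setting, assuming the faiss package is installed; dimensions and counts are illustrative, not from the commit:

import numpy as np
import faiss

d, nlist = 64, 16
xb = np.random.rand(1000, d).astype("float32")

quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)  # IVF index: exposes nprobe
index.train(xb)
index.add(xb)

index.nprobe = 8  # visit 8 of 16 clusters per query: recall vs. speed trade-off
dists, ids = index.search(xb[:1], 5)
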
@@ -348,7 +349,10 @@ def build_or_load_indexes(force_reindex: bool = False):
     try:
         faiss.write_index(index, FAISS_INDEX_PATH)
         with open(FAISS_META_PATH, "wb") as f:
-            pickle.dump({
+            pickle.dump({
+                "texts": corpus_texts,
+                "metadatas": metadatas
+            }, f)
         logger.info("FAISS index saved to %s (entries=%d)", FAISS_INDEX_PATH, total)
     except Exception:
         logger.exception("Failed to persist FAISS index on disk")
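
These two hunks are a matched pair: the writer now persists both texts and metadatas in the sidecar pickle, and the loader reads them back with in-memory fallbacks. A round-trip sketch of just that format; the path and sample data are illustrative:

import pickle

FAISS_META_PATH = "faiss_meta.pkl"  # assumed path constant, mirroring the diff

corpus_texts = ["Paris is the capital of France.", "FAISS does vector search."]
metadatas = [{"filename": "geo.txt", "subject": "geography"},
             {"filename": "ml.txt", "subject": "ml"}]

# persist texts and metadatas side by side (what the write hunk now does)
with open(FAISS_META_PATH, "wb") as f:
    pickle.dump({"texts": corpus_texts, "metadatas": metadatas}, f)

# reload with in-memory fallbacks (what the load hunk now does)
with open(FAISS_META_PATH, "rb") as f:
    meta = pickle.load(f)
texts = meta.get("texts", corpus_texts)
metadatas = meta.get("metadatas", metadatas)
assert len(texts) == len(metadatas) == 2
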
@@ -391,8 +395,9 @@ def _faiss_search(query: str, top_k: int = TOP_K_DOCS, subject: Optional[str] = None):
         if idx < 0 or idx >= len(texts):
             continue
         meta = metadatas[idx]
-        if subject and meta.get("subject") != subject:
-            continue
+        # subject filtering disabled because it blocks many relevant chunks
+        # if subject and meta.get("subject") != subject:
+        #     continue
         score_like = float(-dist)
         results.append((score_like, meta, texts[idx]))
         if len(results) >= top_k:
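
The commit disables the hard subject filter rather than relaxing it. One softer alternative (not what the commit does) is to keep every hit and boost subject matches during ranking. A sketch over the same (score, meta, text) tuples; the helper name and boost value are hypothetical:

from typing import Optional

def rerank_by_subject(results, subject: Optional[str], boost: float = 1.0):
    # results: list of (score, meta, text) tuples, as in _faiss_search
    def key(item):
        score, meta, _ = item
        matches = isinstance(meta, dict) and meta.get("subject") == subject
        return score + (boost if subject and matches else 0.0)
    return sorted(results, key=key, reverse=True)

hits = [(-0.8, {"subject": "math"}, "chunk A"),
        (-0.2, {"subject": "physics"}, "chunk B")]
print(rerank_by_subject(hits, "math"))  # chunk A now outranks chunk B
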
@@ -433,13 +438,23 @@ def hybrid_retrieve(query: str, subject: Optional[str] = None, top_k: int = TOP_K_DOCS):
     merged_texts = []
     merged_meta = []
     for score, meta, text in results_bm25:
-        if text and text.strip()
+        if text and text.strip():
             merged_texts.append(text)
-        merged_meta.append({
+            merged_meta.append({
+                "source": meta.get("filename"),
+                "subject": meta.get("subject"),
+                "score": score
+            })
+
     for score, meta, text in results_faiss:
-        if text and text.strip()
+        if text and text.strip():
             merged_texts.append(text)
-        merged_meta.append({
+            merged_meta.append({
+                "source": meta.get("filename") if isinstance(meta, dict) else None,
+                "subject": meta.get("subject") if isinstance(meta, dict) else None,
+                "score": score
+            })
+
 
     # compose context parts with headers
     context_parts = []
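
As written, the two merge loops append a chunk twice whenever BM25 and FAISS both return it. If that matters downstream, a small dedup pass over the same tuple shape would collapse repeats; the helper below is hypothetical, not part of the commit:

def merge_unique(results_bm25, results_faiss):
    # collapse duplicate chunks while keeping the diff's dict shape
    merged_texts, merged_meta, seen = [], [], set()
    for score, meta, text in list(results_bm25) + list(results_faiss):
        if not (text and text.strip()) or text in seen:
            continue
        seen.add(text)
        merged_texts.append(text)
        merged_meta.append({
            "source": meta.get("filename") if isinstance(meta, dict) else None,
            "subject": meta.get("subject") if isinstance(meta, dict) else None,
            "score": score,
        })
    return merged_texts, merged_meta

bm25_hits = [(2.1, {"filename": "a.txt", "subject": "math"}, "same chunk")]
faiss_hits = [(-0.3, {"filename": "a.txt", "subject": "math"}, "same chunk")]
texts, metas = merge_unique(bm25_hits, faiss_hits)
assert len(texts) == 1  # the duplicate chunk is kept once
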
@@ -457,13 +472,12 @@ def hybrid_retrieve(query: str, subject: Optional[str] = None, top_k: int = TOP_K_DOCS):
 
 
 # ---------- retrieve_node (for reuse) ----------
-def _last_n_user_messages(rows: List[tuple], n: int =
-    """Return only the latest user message for retrieval context."""
+def _last_n_user_messages(rows: List[tuple], n: int = 1) -> List[str]:
     users = [r[1] for r in rows if r[0] == "user"]
-    return users[-
+    return users[-1:]  # always return ONLY the latest user query
 
 def retrieve_node_from_rows(rows: List[tuple], top_k: int = TOP_K_DOCS) -> Dict[str, Any]:
-    last_users = _last_n_user_messages(rows, n=
+    last_users = _last_n_user_messages(rows, n=1)
     current_query = " ".join(last_users).strip() if last_users else ""
     if not current_query:
         return {"context": None, "direct": False}
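
After this hunk, the retrieval query is always the single most recent user turn; the n parameter survives in the signature but is ignored. A self-contained check of that behavior, using the (role, content) row shape the diff assumes:

from typing import List

def _last_n_user_messages(rows: List[tuple], n: int = 1) -> List[str]:
    users = [r[1] for r in rows if r[0] == "user"]
    return users[-1:]  # always return ONLY the latest user query

rows = [("user", "What is FAISS?"),
        ("assistant", "A vector similarity search library."),
        ("user", "How do I set nprobe?")]
print(_last_n_user_messages(rows, n=5))  # ['How do I set nprobe?'] (n is ignored)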