07Codex07 committed on
Commit
7fdfc47
·
1 Parent(s): abb9880

changed the retriever knowledge

Browse files
Files changed (1) hide show
  1. chatbot_retriever.py +26 -12
chatbot_retriever.py CHANGED
@@ -262,8 +262,9 @@ def build_or_load_indexes(force_reindex: bool = False):
262
  try:
263
  index = faiss.read_index(FAISS_INDEX_PATH)
264
  with open(FAISS_META_PATH, "rb") as f:
265
- meta = pickle.load(f)
266
  texts = meta.get("texts", corpus_texts)
 
267
  try:
268
  index.nprobe = FAISS_NPROBE
269
  except Exception:
@@ -348,7 +349,10 @@ def build_or_load_indexes(force_reindex: bool = False):
348
  try:
349
  faiss.write_index(index, FAISS_INDEX_PATH)
350
  with open(FAISS_META_PATH, "wb") as f:
351
- pickle.dump({"texts": corpus_texts}, f)
 
 
 
352
  logger.info("FAISS index saved to %s (entries=%d)", FAISS_INDEX_PATH, total)
353
  except Exception:
354
  logger.exception("Failed to persist FAISS index on disk")
@@ -391,8 +395,9 @@ def _faiss_search(query: str, top_k: int = TOP_K_DOCS, subject: Optional[str] =
391
  if idx < 0 or idx >= len(texts):
392
  continue
393
  meta = metadatas[idx]
394
- if subject and meta.get("subject") != subject:
395
- continue
 
396
  score_like = float(-dist)
397
  results.append((score_like, meta, texts[idx]))
398
  if len(results) >= top_k:
@@ -433,13 +438,23 @@ def hybrid_retrieve(query: str, subject: Optional[str] = None, top_k: int = TOP_
433
  merged_texts = []
434
  merged_meta = []
435
  for score, meta, text in results_bm25:
436
- if text and text.strip() and text not in merged_texts:
437
  merged_texts.append(text)
438
- merged_meta.append({"source": meta.get("filename"), "subject": meta.get("subject"), "score": score})
 
 
 
 
 
439
  for score, meta, text in results_faiss:
440
- if text and text.strip() and text not in merged_texts:
441
  merged_texts.append(text)
442
- merged_meta.append({"source": meta.get("filename") if isinstance(meta, dict) else None, "subject": meta.get("subject") if isinstance(meta, dict) else None, "score": score})
 
 
 
 
 
443
 
444
  # compose context parts with headers
445
  context_parts = []
@@ -457,13 +472,12 @@ def hybrid_retrieve(query: str, subject: Optional[str] = None, top_k: int = TOP_
457
 
458
 
459
  # ---------- retrieve_node (for reuse) ----------
460
- def _last_n_user_messages(rows: List[tuple], n: int = 3) -> List[str]:
461
- """Return only the latest user message for retrieval context."""
462
  users = [r[1] for r in rows if r[0] == "user"]
463
- return users[-n:] # only keep the last one
464
 
465
  def retrieve_node_from_rows(rows: List[tuple], top_k: int = TOP_K_DOCS) -> Dict[str, Any]:
466
- last_users = _last_n_user_messages(rows, n=3)
467
  current_query = " ".join(last_users).strip() if last_users else ""
468
  if not current_query:
469
  return {"context": None, "direct": False}
 
262
  try:
263
  index = faiss.read_index(FAISS_INDEX_PATH)
264
  with open(FAISS_META_PATH, "rb") as f:
265
+ meta = pickle.load(f)
266
  texts = meta.get("texts", corpus_texts)
267
+ metadatas = meta.get("metadatas", metadatas)
268
  try:
269
  index.nprobe = FAISS_NPROBE
270
  except Exception:
 
349
  try:
350
  faiss.write_index(index, FAISS_INDEX_PATH)
351
  with open(FAISS_META_PATH, "wb") as f:
352
+ pickle.dump({
353
+ "texts": corpus_texts,
354
+ "metadatas": metadatas
355
+ }, f)
356
  logger.info("FAISS index saved to %s (entries=%d)", FAISS_INDEX_PATH, total)
357
  except Exception:
358
  logger.exception("Failed to persist FAISS index on disk")
 
395
  if idx < 0 or idx >= len(texts):
396
  continue
397
  meta = metadatas[idx]
398
+ # subject filtering disabled because it blocks many relevant chunks
399
+ # if subject and meta.get("subject") != subject:
400
+ # continue
401
  score_like = float(-dist)
402
  results.append((score_like, meta, texts[idx]))
403
  if len(results) >= top_k:
 
438
  merged_texts = []
439
  merged_meta = []
440
  for score, meta, text in results_bm25:
441
+ if text and text.strip():
442
  merged_texts.append(text)
443
+ merged_meta.append({
444
+ "source": meta.get("filename"),
445
+ "subject": meta.get("subject"),
446
+ "score": score
447
+ })
448
+
449
  for score, meta, text in results_faiss:
450
+ if text and text.strip():
451
  merged_texts.append(text)
452
+ merged_meta.append({
453
+ "source": meta.get("filename") if isinstance(meta, dict) else None,
454
+ "subject": meta.get("subject") if isinstance(meta, dict) else None,
455
+ "score": score
456
+ })
457
+
458
 
459
  # compose context parts with headers
460
  context_parts = []
 
472
 
473
 
474
  # ---------- retrieve_node (for reuse) ----------
475
+ def _last_n_user_messages(rows: List[tuple], n: int = 1) -> List[str]:
 
476
  users = [r[1] for r in rows if r[0] == "user"]
477
+ return users[-1:]  # always return ONLY the latest user query
478
 
479
  def retrieve_node_from_rows(rows: List[tuple], top_k: int = TOP_K_DOCS) -> Dict[str, Any]:
480
+ last_users = _last_n_user_messages(rows, n=1)
481
  current_query = " ".join(last_users).strip() if last_users else ""
482
  if not current_query:
483
  return {"context": None, "direct": False}