chouchouvs committed
Commit 849d996 · verified · 1 parent: 75159bc

Update main.py

Files changed (1)
  1. main.py +70 -27
main.py CHANGED
@@ -275,8 +275,8 @@ except Exception as e:
 # ======================================================================================

 class FileIn(BaseModel):
-    path: str
-    text: str
+    path: Optional[str] = ""   # lenient: accept None
+    text: Optional[str] = ""   # likewise

 class IndexRequest(BaseModel):
     project_id: str = Field(..., min_length=1)
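Note for reviewers: Optional[str] = "" only covers a missing field; an explicit null in the request still arrives as None, which is why the indexing loop below keeps its "or ''" guards. A minimal illustrative sketch (assumes pydantic is installed; FileInSketch is a stand-in, not the real model):

from typing import Optional
from pydantic import BaseModel

class FileInSketch(BaseModel):           # stand-in for FileIn, illustration only
    path: Optional[str] = ""
    text: Optional[str] = ""

print(FileInSketch().path)               # ""   -> missing field falls back to the default
print(FileInSketch(path=None).path)      # None -> an explicit null is still allowed through
print(FileInSketch(path="a.py").path)    # "a.py"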
@@ -495,7 +495,7 @@ def _clean_chunk_text(text: str) -> str:
     """
     if not text:
         return text
-    t = text.strip()
+    t = (text or "").strip()

     # strip a typical suffix: , "indexed_at": "2025-..."}}
     t = re.sub(r',\s*"indexed_at"\s*:\s*"[^"]*"\s*}+\s*$', '', t, flags=re.IGNORECASE)
@@ -528,7 +528,7 @@ def _clean_chunk_text(text: str) -> str:
     return t.strip()

 # ======================================================================================
-# Background task: indexing — FIXED VERSION
+# Background task: indexing — FIXED VERSION (anti-dup & robustness additions)
 # ======================================================================================

 def run_index_job(job_id: str, req: IndexRequest):
@@ -537,15 +537,18 @@ def run_index_job(job_id: str, req: IndexRequest):
     _append_log(job_id, f"Start project={req.project_id} files={len(req.files)} | backends={EMB_BACKEND_ORDER} | store={VECTOR_STORE} (deterministic_ids={QDRANT_DETERMINISTIC_IDS}, mode={QDRANT_ID_MODE})")
     LOG.info(f"[{job_id}] Index start project={req.project_id} files={len(req.files)}")

+    # set of chunk hashes already seen in this job (intra-job dedup)
+    seen_chunk_hashes = set()
+
     # --- DEBUG DIAGNOSTIC (INSERTED HERE) ---
     try:
         N_SAMPLE = 6
         sample = req.files[:N_SAMPLE]
         seen_hashes = {}
         for fidx, fi in enumerate(sample, 1):
-            p = getattr(fi, "path", "") or ""
-            t = getattr(fi, "text", "") or ""
-            h = hashlib.blake2b(t.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+            p = (getattr(fi, "path", "") or "") or ""
+            t = (getattr(fi, "text", "") or "") or ""
+            h = hashlib.blake2b((t or "").encode("utf-8", "ignore"), digest_size=8).hexdigest()
             seen_hashes.setdefault(h, []).append(p)
             LOG.info(f"[{job_id}] recv file #{fidx}: path={p!r} len_text={len(t)} hash8={h} preview={repr(t[:120])}")
         if len(req.files) > N_SAMPLE:
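The diagnostics above, like the dedup sets added further down, fingerprint content with hashlib.blake2b at digest_size=8, i.e. a 16-hex-character digest; encoding with errors="ignore" keeps the hash step safe on odd input. A standalone sketch of the pattern (fingerprint8 is a hypothetical helper name):

import hashlib

def fingerprint8(s: str) -> str:
    # 8-byte BLAKE2b digest -> 16 hex chars, enough to spot duplicate payloads in logs
    return hashlib.blake2b((s or "").encode("utf-8", "ignore"), digest_size=8).hexdigest()

seen = {}
for name, payload in [("a.py", "print('hi')"), ("copy_of_a.py", "print('hi')")]:
    seen.setdefault(fingerprint8(payload), []).append(name)

dupes = {h: paths for h, paths in seen.items() if len(paths) > 1}
print(dupes)  # both names share one hash -> the two files carry identical text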
@@ -570,7 +573,7 @@ def run_index_job(job_id: str, req: IndexRequest):
     # --- WARMUP: compute a test embedding to determine the dimension (dim) ---
     # Use an initial chunk (or the string 'warmup' if there are no files)
     if req.files:
-        warm_text = next(_chunk_with_spans(req.files[0].text or "", req.chunk_size, req.overlap))[2]
+        warm_text = next(_chunk_with_spans((req.files[0].text or ""), req.chunk_size, req.overlap))[2]
     else:
         warm_text = "warmup"
     try:
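For context, _chunk_with_spans (defined elsewhere in main.py and not shown in this diff) is consumed as a generator of (start, end, text) tuples, since the warmup takes next(...)[2]. A rough, purely hypothetical sketch of such a sliding-window chunker, only to make the tuple shape concrete; the real helper may split differently:

from typing import Iterator, Tuple

def chunk_with_spans_sketch(text: str, chunk_size: int = 800, overlap: int = 100) -> Iterator[Tuple[int, int, str]]:
    # fixed-size windows with overlap; always yields at least one (possibly empty) chunk
    step = max(1, chunk_size - overlap)
    pos = 0
    while pos == 0 or pos < len(text):
        end = min(len(text), pos + chunk_size)
        yield pos, end, text[pos:end]
        if end >= len(text):
            break
        pos += step

start, end, warm_text = next(chunk_with_spans_sketch("warmup example text"))
print(start, end, warm_text)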
@@ -647,10 +650,19 @@ def run_index_job(job_id: str, req: IndexRequest):
         if skipped > 0:
             _append_log(job_id, f"Dedup intra-batch: skipped {skipped} duplicates")

-        vecs, sz = _post_embeddings(buf_chunks, job_id=job_id)
+        try:
+            vecs, sz = _post_embeddings(buf_chunks, job_id=job_id)
+        except Exception as e:
+            # failure -> log it and fail the job cleanly (the buffer is kept for debugging, but we stop)
+            LOG.exception("[%s] Embeddings failed during flush: %s", job_id, e)
+            _append_log(job_id, f"Embeddings failed during flush: {e}")
+            _set_status(job_id, "error")
+            raise
+
         added = STORE.upsert(col, vecs, buf_metas)
         total_chunks += added
         _append_log(job_id, f"+{added} chunks (total={total_chunks}) ~{(sz/1024.0):.1f}KiB")
+        # clear buffers ONLY after success
         buf_chunks, buf_metas = [], []

     # ✅ Filter relevant files
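The reworked _flush wraps only the embedding call: on failure the buffers are deliberately left intact for inspection, the job is marked as errored, and the exception is re-raised; the buffers are cleared only after a successful upsert. A self-contained sketch of that flush pattern (embed and upsert are placeholder callables here, not the real _post_embeddings / STORE.upsert):

import logging

LOG = logging.getLogger("flush-sketch")

def flush_batch(chunks, metas, embed, upsert, on_error=lambda: None):
    """Embed and store one batch; keep the buffers if embedding fails."""
    if not chunks:
        return [], [], 0
    try:
        vectors = embed(chunks)
    except Exception:
        LOG.exception("embedding failed; keeping %d buffered chunks for inspection", len(chunks))
        on_error()   # e.g. mark the job as 'error'
        raise        # let the caller stop the job
    added = upsert(vectors, metas)
    return [], [], added  # buffers are reset only after a successful upsert

# usage: buf_chunks, buf_metas, n = flush_batch(buf_chunks, buf_metas, embed_fn, store_fn)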
@@ -658,26 +670,34 @@ def run_index_job(job_id: str, req: IndexRequest):
     IGNORE_PREFIXES = {".git", "__pycache__", ".vscode", ".idea", "node_modules", "build", "dist", "venv", ".env", ".log", ".tmp"}

     for fi, f in enumerate(req.files, 1):
-        path = f.path.strip()
+        # defensive: path/text may be None -> use a fallback
+        path_raw = (getattr(f, "path", "") or "")   # may be None
+        path = (path_raw or "").strip()
+        text_raw = (getattr(f, "text", "") or "")
+        text = text_raw or ""
+
         if not path:
-            continue
+            # stable fallback path based on the text hash (avoids collisions on None)
+            h8 = hashlib.blake2b((text or "").encode("utf-8", "ignore"), digest_size=8).hexdigest()
+            path = f"__no_path__{h8}"

-        # ✅ Ignore non-text files or files in ignored folders
         if any(path.startswith(p) for p in IGNORE_PREFIXES):
             _append_log(job_id, f"📁 Ignored: {path} (dossier ignoré)")
             continue

         ext = os.path.splitext(path)[1].lower()
-        text = f.text or ""
-        if len(text.strip()) < 50:  # Ignore files that are too short
-            _append_log(job_id, f"📄 Ignored: {path} (texte trop court: {len(text.strip())} chars)")
+        if ext not in TEXT_EXTS:
+            _append_log(job_id, f"📁 Ignored: {path} (extension non supportée: {ext})")
+            continue
+
+        if len((text or "").strip()) < 50:  # ✅ Ignore files that are too short
+            _append_log(job_id, f"📄 Ignored: {path} (texte trop court: {len((text or '').strip())} chars)")
             continue

         _append_log(job_id, f"📄 Processing: {path} ({len(text)} chars)")

         # --- special handling for JSON / NDJSON ---
         if ext in {".json"} or path.lower().endswith(".ndjson"):
-            # try full JSON first
             handled = False
             try:
                 parsed = json.loads(text)
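Two additions in this hunk are worth a note: files with an empty path now get a stable synthetic name derived from a content hash, so distinct payloads no longer collide on an empty key, and files are skipped unless their extension is in TEXT_EXTS, which must be defined elsewhere in main.py since it does not appear in this diff. A sketch of both ideas, with a purely hypothetical whitelist:

import hashlib
import os

TEXT_EXTS_SKETCH = {".py", ".md", ".txt", ".json", ".ndjson", ".yaml", ".yml", ".toml"}  # hypothetical whitelist

def normalize_path(path, text):
    path = (path or "").strip()
    if not path:
        # identical content always maps to the same synthetic path
        h8 = hashlib.blake2b((text or "").encode("utf-8", "ignore"), digest_size=8).hexdigest()
        path = f"__no_path__{h8}"
    return path

def is_text_candidate(path):
    return os.path.splitext(path)[1].lower() in TEXT_EXTS_SKETCH

print(normalize_path(None, "print('hi')"))                            # __no_path__<16 hex chars>
print(is_text_candidate("notes.md"), is_text_candidate("logo.png"))   # True False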
@@ -691,6 +711,12 @@ def run_index_job(job_id: str, req: IndexRequest):
                         s = _clean_chunk_text(s)
                         if len(s) < 30:
                             continue
+                        # global intra-job dedup
+                        chash = hashlib.blake2b(s.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+                        if chash in seen_chunk_hashes:
+                            continue
+                        seen_chunk_hashes.add(chash)
+
                         meta = {"path": path, "chunk": idx, "start": 0, "end": len(s)}
                         if req.store_text:
                             meta["text"] = s
@@ -702,17 +728,20 @@ def run_index_job(job_id: str, req: IndexRequest):
                     s = " ".join(str(v) for v in parsed.values() if isinstance(v, (str, int, float)))
                     s = _clean_chunk_text(s)
                     if len(s) >= 30:
-                        meta = {"path": path, "chunk": 0, "start": 0, "end": len(s)}
-                        if req.store_text:
-                            meta["text"] = s
-                        buf_chunks.append(s); buf_metas.append(meta)
-                        if len(buf_chunks) >= req.batch_size:
-                            _flush()
+                        chash = hashlib.blake2b(s.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+                        if chash not in seen_chunk_hashes:
+                            seen_chunk_hashes.add(chash)
+                            meta = {"path": path, "chunk": 0, "start": 0, "end": len(s)}
+                            if req.store_text:
+                                meta["text"] = s
+                            buf_chunks.append(s); buf_metas.append(meta)
+                            if len(buf_chunks) >= req.batch_size:
+                                _flush()
                     handled = True
             except Exception:
-                # NDJSON fallback: one line == one JSON object
+                # NDJSON fallback: one line == one JSON object or plain text
                 try:
-                    lines = [L for L in text.splitlines() if L.strip()]
+                    lines = [L for L in (text or "").splitlines() if L.strip()]
                     for li, line in enumerate(lines):
                         try:
                             obj = json.loads(line)
@@ -723,6 +752,10 @@ def run_index_job(job_id: str, req: IndexRequest):
                             s = _clean_chunk_text(s)
                             if len(s) < 30:
                                 continue
+                            chash = hashlib.blake2b(s.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+                            if chash in seen_chunk_hashes:
+                                continue
+                            seen_chunk_hashes.add(chash)
                             meta = {"path": path, "chunk": li, "start": 0, "end": len(s)}
                             if req.store_text:
                                 meta["text"] = s
@@ -731,9 +764,13 @@ def run_index_job(job_id: str, req: IndexRequest):
                                     _flush()
                         except Exception:
                             # non-JSON line -> index as plain text if long enough
-                            sl = line.strip()
+                            sl = (line or "").strip()
                             if len(sl) >= 30:
                                 sl = _clean_chunk_text(sl)
+                                chash = hashlib.blake2b(sl.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+                                if chash in seen_chunk_hashes:
+                                    continue
+                                seen_chunk_hashes.add(chash)
                                 meta = {"path": path, "chunk": li, "start": 0, "end": len(sl)}
                                 if req.store_text:
                                     meta["text"] = sl
@@ -750,8 +787,8 @@ def run_index_job(job_id: str, req: IndexRequest):
             continue  # move on to the next file

         # --- normal handling for text files ---
-        for ci, (start, end, chunk_txt) in enumerate(_chunk_with_spans(text, req.chunk_size, req.overlap)):
-            chunk_txt = chunk_txt.strip()
+        for ci, (start, end, chunk_txt) in enumerate(_chunk_with_spans(text or "", req.chunk_size, req.overlap)):
+            chunk_txt = (chunk_txt or "").strip()
             if len(chunk_txt) < 30:  # ✅ skip chunks that are too short
                 continue
             # cleanup to avoid JSON artifacts / timestamps
@@ -759,6 +796,12 @@ def run_index_job(job_id: str, req: IndexRequest):
             if len(chunk_txt) < 30:
                 continue

+            # global intra-job dedup (prevents repetitions)
+            chash = hashlib.blake2b(chunk_txt.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+            if chash in seen_chunk_hashes:
+                continue
+            seen_chunk_hashes.add(chash)
+
             buf_chunks.append(chunk_txt)
             meta = {
                 "path": path,
 