Update main.py
main.py (changed)
@@ -275,8 +275,8 @@ except Exception as e:
 # ======================================================================================
 
 class FileIn(BaseModel):
-    path: str
-    text: str
+    path: Optional[str] = ""  # more tolerant: accepts None
+    text: Optional[str] = ""  # same
 
 class IndexRequest(BaseModel):
     project_id: str = Field(..., min_length=1)
@@ -495,7 +495,7 @@ def _clean_chunk_text(text: str) -> str:
     """
     if not text:
         return text
-    t = text.strip()
+    t = (text or "").strip()
 
     # strip a typical suffix: , "indexed_at": "2025-..."}}
     t = re.sub(r',\s*"indexed_at"\s*:\s*"[^"]*"\s*}+\s*$', '', t, flags=re.IGNORECASE)
@@ -528,7 +528,7 @@ def _clean_chunk_text(text: str) -> str:
     return t.strip()
 
 # ======================================================================================
-# Background task: indexing — CORRECTED VERSION
+# Background task: indexing — CORRECTED VERSION (anti-dup & robustness additions)
 # ======================================================================================
 
 def run_index_job(job_id: str, req: IndexRequest):
@@ -537,15 +537,18 @@ def run_index_job(job_id: str, req: IndexRequest):
     _append_log(job_id, f"Start project={req.project_id} files={len(req.files)} | backends={EMB_BACKEND_ORDER} | store={VECTOR_STORE} (deterministic_ids={QDRANT_DETERMINISTIC_IDS}, mode={QDRANT_ID_MODE})")
     LOG.info(f"[{job_id}] Index start project={req.project_id} files={len(req.files)}")
 
+    # set of chunk hashes already seen in this job (intra-job dedup)
+    seen_chunk_hashes = set()
+
     # --- DEBUG DIAGNOSTIC (INSERT HERE) ---
     try:
         N_SAMPLE = 6
         sample = req.files[:N_SAMPLE]
         seen_hashes = {}
         for fidx, fi in enumerate(sample, 1):
-            p = getattr(fi, "path", "") or ""
-            t = getattr(fi, "text", "") or ""
-            h = hashlib.blake2b(t.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+            p = (getattr(fi, "path", "") or "") or ""
+            t = (getattr(fi, "text", "") or "") or ""
+            h = hashlib.blake2b((t or "").encode("utf-8", "ignore"), digest_size=8).hexdigest()
             seen_hashes.setdefault(h, []).append(p)
             LOG.info(f"[{job_id}] recv file #{fidx}: path={p!r} len_text={len(t)} hash8={h} preview={repr(t[:120])}")
         if len(req.files) > N_SAMPLE:
@@ -570,7 +573,7 @@ def run_index_job(job_id: str, req: IndexRequest):
     # --- WARMUP: compute a test embedding to determine the vector dimension (dim) ---
     # Use the first chunk (or the string 'warmup' if there are no files)
     if req.files:
-        warm_text = next(_chunk_with_spans(req.files[0].text or "", req.chunk_size, req.overlap))[2]
+        warm_text = next(_chunk_with_spans((req.files[0].text or ""), req.chunk_size, req.overlap))[2]
     else:
         warm_text = "warmup"
     try:
@@ -647,10 +650,19 @@ def run_index_job(job_id: str, req: IndexRequest):
         if skipped > 0:
             _append_log(job_id, f"Dedup intra-batch: skipped {skipped} duplicates")
 
-        vecs, sz = _post_embeddings(buf_chunks, job_id=job_id)
+        try:
+            vecs, sz = _post_embeddings(buf_chunks, job_id=job_id)
+        except Exception as e:
+            # failure -> log it and fail the job cleanly (keep the buffer for debugging, but stop)
+            LOG.exception("[%s] Embeddings failed during flush: %s", job_id, e)
+            _append_log(job_id, f"Embeddings failed during flush: {e}")
+            _set_status(job_id, "error")
+            raise
+
         added = STORE.upsert(col, vecs, buf_metas)
         total_chunks += added
         _append_log(job_id, f"+{added} chunks (total={total_chunks}) ~{(sz/1024.0):.1f}KiB")
+        # clear the buffers ONLY after success
        buf_chunks, buf_metas = [], []
 
     # ✅ Filter to relevant files
@@ -658,26 +670,34 @@ def run_index_job(job_id: str, req: IndexRequest):
     IGNORE_PREFIXES = {".git", "__pycache__", ".vscode", ".idea", "node_modules", "build", "dist", "venv", ".env", ".log", ".tmp"}
 
     for fi, f in enumerate(req.files, 1):
-        path = (f.path or "").strip()
+        # defensive: path/text may be None -> fall back to defaults
+        path_raw = (getattr(f, "path", "") or "")  # may be None
+        path = (path_raw or "").strip()
+        text_raw = (getattr(f, "text", "") or "")
+        text = text_raw or ""
+
         if not path:
-            continue
+            # stable fallback path based on a hash of the text (avoids None collisions)
+            h8 = hashlib.blake2b((text or "").encode("utf-8", "ignore"), digest_size=8).hexdigest()
+            path = f"__no_path__{h8}"
 
-        # ✅ Skip non-text files or files in ignored folders
         if any(path.startswith(p) for p in IGNORE_PREFIXES):
             _append_log(job_id, f"📁 Ignored: {path} (ignored folder)")
             continue
 
         ext = os.path.splitext(path)[1].lower()
-        text = f.text or ""
-        if ext not in TEXT_EXTS or len(text.strip()) < 50:
-            _append_log(job_id, f"Ignored: {path}")
+        if ext not in TEXT_EXTS:
+            _append_log(job_id, f"📁 Ignored: {path} (unsupported extension: {ext})")
+            continue
+
+        if len((text or "").strip()) < 50:  # ✅ skip files that are too short
+            _append_log(job_id, f"📄 Ignored: {path} (text too short: {len((text or '').strip())} chars)")
             continue
 
         _append_log(job_id, f"📄 Processing: {path} ({len(text)} chars)")
 
         # --- special handling for JSON / NDJSON ---
         if ext in {".json"} or path.lower().endswith(".ndjson"):
-            # try full JSON first
             handled = False
             try:
                 parsed = json.loads(text)
@@ -691,6 +711,12 @@ def run_index_job(job_id: str, req: IndexRequest):
                         s = _clean_chunk_text(s)
                         if len(s) < 30:
                             continue
+                        # global intra-job dedup
+                        chash = hashlib.blake2b(s.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+                        if chash in seen_chunk_hashes:
+                            continue
+                        seen_chunk_hashes.add(chash)
+
                         meta = {"path": path, "chunk": idx, "start": 0, "end": len(s)}
                         if req.store_text:
                             meta["text"] = s
@@ -702,17 +728,20 @@ def run_index_job(job_id: str, req: IndexRequest):
                     s = " ".join(str(v) for v in parsed.values() if isinstance(v, (str, int, float)))
                     s = _clean_chunk_text(s)
                     if len(s) >= 30:
-                        meta = {"path": path, "chunk": 0, "start": 0, "end": len(s)}
-                        if req.store_text:
-                            meta["text"] = s
-                        buf_chunks.append(s); buf_metas.append(meta)
-                        if len(buf_chunks) >= req.batch_size:
-                            _flush()
+                        chash = hashlib.blake2b(s.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+                        if chash not in seen_chunk_hashes:
+                            seen_chunk_hashes.add(chash)
+                            meta = {"path": path, "chunk": 0, "start": 0, "end": len(s)}
+                            if req.store_text:
+                                meta["text"] = s
+                            buf_chunks.append(s); buf_metas.append(meta)
+                            if len(buf_chunks) >= req.batch_size:
+                                _flush()
                     handled = True
                 except Exception:
-                    # NDJSON fallback: one line == one JSON object
+                    # NDJSON fallback: one line == one JSON object or plain text
                     try:
-                        lines = [L for L in text.splitlines() if L.strip()]
+                        lines = [L for L in (text or "").splitlines() if L.strip()]
                         for li, line in enumerate(lines):
                             try:
                                 obj = json.loads(line)
@@ -723,6 +752,10 @@ def run_index_job(job_id: str, req: IndexRequest):
                                 s = _clean_chunk_text(s)
                                 if len(s) < 30:
                                     continue
+                                chash = hashlib.blake2b(s.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+                                if chash in seen_chunk_hashes:
+                                    continue
+                                seen_chunk_hashes.add(chash)
                                 meta = {"path": path, "chunk": li, "start": 0, "end": len(s)}
                                 if req.store_text:
                                     meta["text"] = s
@@ -731,9 +764,13 @@ def run_index_job(job_id: str, req: IndexRequest):
                                     _flush()
                             except Exception:
                                 # non-JSON line -> index it as plain text if long enough
-                                sl = line.strip()
+                                sl = (line or "").strip()
                                 if len(sl) >= 30:
                                     sl = _clean_chunk_text(sl)
+                                    chash = hashlib.blake2b(sl.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+                                    if chash in seen_chunk_hashes:
+                                        continue
+                                    seen_chunk_hashes.add(chash)
                                     meta = {"path": path, "chunk": li, "start": 0, "end": len(sl)}
                                     if req.store_text:
                                         meta["text"] = sl
@@ -750,8 +787,8 @@ def run_index_job(job_id: str, req: IndexRequest):
             continue  # move on to the next file
 
         # --- normal handling for text files ---
-        for ci, (start, end, chunk_txt) in enumerate(_chunk_with_spans(text, req.chunk_size, req.overlap)):
-            chunk_txt = chunk_txt.strip()
+        for ci, (start, end, chunk_txt) in enumerate(_chunk_with_spans(text or "", req.chunk_size, req.overlap)):
+            chunk_txt = (chunk_txt or "").strip()
             if len(chunk_txt) < 30:  # ✅ skip chunks that are too short
                 continue
             # cleanup to avoid JSON artifacts / timestamps
@@ -759,6 +796,12 @@ def run_index_job(job_id: str, req: IndexRequest):
             if len(chunk_txt) < 30:
                 continue
 
+            # global intra-job dedup (prevents repeats)
+            chash = hashlib.blake2b(chunk_txt.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+            if chash in seen_chunk_hashes:
+                continue
+            seen_chunk_hashes.add(chash)
+
             buf_chunks.append(chunk_txt)
             meta = {
                 "path": path,
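The recurring addition across these hunks is the job-wide seen_chunk_hashes set, keyed on 8-byte blake2b digests of the cleaned chunk text, so the same chunk is never buffered twice within one indexing job. A minimal, self-contained sketch of that dedup pattern (the function and variable names below are illustrative, not taken from main.py):

import hashlib

def dedup_chunks(chunks):
    """Keep the first occurrence of each chunk, keyed on a short blake2b digest.

    Standalone illustration of the seen_chunk_hashes logic added in run_index_job,
    not the code used by the service itself.
    """
    seen = set()
    unique = []
    for text in chunks:
        digest = hashlib.blake2b(text.encode("utf-8", "ignore"), digest_size=8).hexdigest()
        if digest in seen:
            continue  # duplicate chunk text: skip it
        seen.add(digest)
        unique.append(text)
    return unique

print(dedup_chunks(["alpha", "beta", "alpha"]))  # ['alpha', 'beta']

An 8-byte digest keeps the set cheap to hold in memory; collisions are possible in principle but negligible at the scale of a single job.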
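The FileIn change at the top of the diff swaps the required str fields for Optional[str] with empty-string defaults, so a payload with a missing or null path/text no longer fails request validation and is instead normalised by the getattr/fallback guards in the indexing loop. A quick sketch of the difference, assuming Pydantic is installed (StrictFileIn and TolerantFileIn are illustrative names, not classes from main.py):

from typing import Optional
from pydantic import BaseModel, ValidationError

class StrictFileIn(BaseModel):    # old shape: both fields required, None rejected
    path: str
    text: str

class TolerantFileIn(BaseModel):  # new shape: missing fields default to "", None accepted
    path: Optional[str] = ""
    text: Optional[str] = ""

try:
    StrictFileIn(path=None, text="hello")
except ValidationError:
    print("strict model rejects a null path")

print(TolerantFileIn(text="hello"))          # path='' text='hello'
print(TolerantFileIn(path=None, text=None))  # path=None text=None

The trade-off is that downstream code must handle None and empty values itself, which is exactly what the defensive reads in the for-loop over req.files now do.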