Update main.py
main.py (changed)
@@ -275,8 +275,8 @@ except Exception as e:
 # ======================================================================================
 
 class FileIn(BaseModel):
-    path: str
-    text: str
+    path: Optional[str] = ""  # more tolerant: accepts None
+    text: Optional[str] = ""  # same
 
 class IndexRequest(BaseModel):
     project_id: str = Field(..., min_length=1)
@@ -495,7 +495,7 @@ def _clean_chunk_text(text: str) -> str:
     """
     if not text:
         return text
-    t = text.strip()
+    t = (text or "").strip()
 
     # strip a typical suffix: , "indexed_at": "2025-..."}}
     t = re.sub(r',\s*"indexed_at"\s*:\s*"[^"]*"\s*}+\s*$', '', t, flags=re.IGNORECASE)
@@ -528,7 +528,7 @@ def _clean_chunk_text(text: str) -> str:
     return t.strip()
 
 # ======================================================================================
-# Background task: indexing — CORRECTED VERSION
+# Background task: indexing — CORRECTED VERSION (anti-dup & robustness additions)
 # ======================================================================================
 
 def run_index_job(job_id: str, req: IndexRequest):
@@ -537,15 +537,18 @@ def run_index_job(job_id: str, req: IndexRequest):
     _append_log(job_id, f"Start project={req.project_id} files={len(req.files)} | backends={EMB_BACKEND_ORDER} | store={VECTOR_STORE} (deterministic_ids={QDRANT_DETERMINISTIC_IDS}, mode={QDRANT_ID_MODE})")
     LOG.info(f"[{job_id}] Index start project={req.project_id} files={len(req.files)}")
 
+    # set of chunk hashes already seen in this job (intra-job dedup)
+    seen_chunk_hashes = set()
+
     # --- DEBUG DIAGNOSTIC (INSERT HERE) ---
     try:
         N_SAMPLE = 6
         sample = req.files[:N_SAMPLE]
         seen_hashes = {}
         for fidx, fi in enumerate(sample, 1):
-            p = getattr(fi, "path", "") or ""
-            t = getattr(fi, "text", "") or ""
-            h = hashlib.blake2b(t.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+            p = (getattr(fi, "path", "") or "") or ""
+            t = (getattr(fi, "text", "") or "") or ""
+            h = hashlib.blake2b((t or "").encode("utf-8", "ignore"), digest_size=8).hexdigest()
             seen_hashes.setdefault(h, []).append(p)
             LOG.info(f"[{job_id}] recv file #{fidx}: path={p!r} len_text={len(t)} hash8={h} preview={repr(t[:120])}")
         if len(req.files) > N_SAMPLE:
@@ -570,7 +573,7 @@ def run_index_job(job_id: str, req: IndexRequest):
     # --- WARMUP: compute a test embedding to determine the vector dimension (dim) ---
     # Use the first chunk (or the string 'warmup' if there are no files)
     if req.files:
-        warm_text = next(_chunk_with_spans(req.files[0].text or "", req.chunk_size, req.overlap))[2]
+        warm_text = next(_chunk_with_spans((req.files[0].text or ""), req.chunk_size, req.overlap))[2]
     else:
         warm_text = "warmup"
     try:
@@ -647,10 +650,19 @@ def run_index_job(job_id: str, req: IndexRequest):
         if skipped > 0:
             _append_log(job_id, f"Dedup intra-batch: skipped {skipped} duplicates")
 
-        vecs, sz = _post_embeddings(buf_chunks, job_id=job_id)
+        try:
+            vecs, sz = _post_embeddings(buf_chunks, job_id=job_id)
+        except Exception as e:
+            # failure -> log it and fail the job cleanly (keep the buffer for debugging, but stop)
+            LOG.exception("[%s] Embeddings failed during flush: %s", job_id, e)
+            _append_log(job_id, f"Embeddings failed during flush: {e}")
+            _set_status(job_id, "error")
+            raise
+
         added = STORE.upsert(col, vecs, buf_metas)
         total_chunks += added
         _append_log(job_id, f"+{added} chunks (total={total_chunks}) ~{(sz/1024.0):.1f}KiB")
+        # clear the buffers ONLY after success
        buf_chunks, buf_metas = [], []
 
     # ✅ Filter to relevant files
@@ -658,26 +670,34 @@ def run_index_job(job_id: str, req: IndexRequest):
     IGNORE_PREFIXES = {".git", "__pycache__", ".vscode", ".idea", "node_modules", "build", "dist", "venv", ".env", ".log", ".tmp"}
 
     for fi, f in enumerate(req.files, 1):
-        path = (f.path or "").strip()
+        # defensive: path/text may be None -> fall back to defaults
+        path_raw = (getattr(f, "path", "") or "")  # may be None
+        path = (path_raw or "").strip()
+        text_raw = (getattr(f, "text", "") or "")
+        text = text_raw or ""
+
         if not path:
-            continue
+            # stable fallback path based on a hash of the text (avoids None collisions)
+            h8 = hashlib.blake2b((text or "").encode("utf-8", "ignore"), digest_size=8).hexdigest()
+            path = f"__no_path__{h8}"
 
-        # ✅ Skip non-text files or files in ignored folders
         if any(path.startswith(p) for p in IGNORE_PREFIXES):
             _append_log(job_id, f"📁 Ignored: {path} (ignored folder)")
             continue
 
         ext = os.path.splitext(path)[1].lower()
-        text = f.text or ""
-        if ext not in TEXT_EXTS or len(text.strip()) < 50:
-            _append_log(job_id, f"Ignored: {path}")
+        if ext not in TEXT_EXTS:
+            _append_log(job_id, f"📁 Ignored: {path} (unsupported extension: {ext})")
+            continue
+
+        if len((text or "").strip()) < 50:  # ✅ skip files that are too short
+            _append_log(job_id, f"📄 Ignored: {path} (text too short: {len((text or '').strip())} chars)")
             continue
 
         _append_log(job_id, f"📄 Processing: {path} ({len(text)} chars)")
 
         # --- special handling for JSON / NDJSON ---
         if ext in {".json"} or path.lower().endswith(".ndjson"):
-            # try full JSON first
             handled = False
             try:
                 parsed = json.loads(text)
@@ -691,6 +711,12 @@ def run_index_job(job_id: str, req: IndexRequest):
                         s = _clean_chunk_text(s)
                         if len(s) < 30:
                             continue
+                        # global intra-job dedup
+                        chash = hashlib.blake2b(s.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+                        if chash in seen_chunk_hashes:
+                            continue
+                        seen_chunk_hashes.add(chash)
+
                         meta = {"path": path, "chunk": idx, "start": 0, "end": len(s)}
                         if req.store_text:
                             meta["text"] = s
@@ -702,17 +728,20 @@ def run_index_job(job_id: str, req: IndexRequest):
                     s = " ".join(str(v) for v in parsed.values() if isinstance(v, (str, int, float)))
                     s = _clean_chunk_text(s)
                     if len(s) >= 30:
-                        meta = {"path": path, "chunk": 0, "start": 0, "end": len(s)}
-                        if req.store_text:
-                            meta["text"] = s
-                        buf_chunks.append(s); buf_metas.append(meta)
-                        if len(buf_chunks) >= req.batch_size:
-                            _flush()
+                        chash = hashlib.blake2b(s.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+                        if chash not in seen_chunk_hashes:
+                            seen_chunk_hashes.add(chash)
+                            meta = {"path": path, "chunk": 0, "start": 0, "end": len(s)}
+                            if req.store_text:
+                                meta["text"] = s
+                            buf_chunks.append(s); buf_metas.append(meta)
+                            if len(buf_chunks) >= req.batch_size:
+                                _flush()
                     handled = True
                 except Exception:
-                    # NDJSON fallback: one line == one JSON object
+                    # NDJSON fallback: one line == one JSON object or plain text
                     try:
-                        lines = [L for L in text.splitlines() if L.strip()]
+                        lines = [L for L in (text or "").splitlines() if L.strip()]
                         for li, line in enumerate(lines):
                             try:
                                 obj = json.loads(line)
@@ -723,6 +752,10 @@ def run_index_job(job_id: str, req: IndexRequest):
                                 s = _clean_chunk_text(s)
                                 if len(s) < 30:
                                     continue
+                                chash = hashlib.blake2b(s.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+                                if chash in seen_chunk_hashes:
+                                    continue
+                                seen_chunk_hashes.add(chash)
                                 meta = {"path": path, "chunk": li, "start": 0, "end": len(s)}
                                 if req.store_text:
                                     meta["text"] = s
@@ -731,9 +764,13 @@ def run_index_job(job_id: str, req: IndexRequest):
                                     _flush()
                             except Exception:
                                 # non-JSON line -> index it as plain text if long enough
-                                sl = line.strip()
+                                sl = (line or "").strip()
                                 if len(sl) >= 30:
                                     sl = _clean_chunk_text(sl)
+                                    chash = hashlib.blake2b(sl.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+                                    if chash in seen_chunk_hashes:
+                                        continue
+                                    seen_chunk_hashes.add(chash)
                                     meta = {"path": path, "chunk": li, "start": 0, "end": len(sl)}
                                     if req.store_text:
                                         meta["text"] = sl
@@ -750,8 +787,8 @@ def run_index_job(job_id: str, req: IndexRequest):
             continue  # move on to the next file
 
         # --- normal handling for text files ---
-        for ci, (start, end, chunk_txt) in enumerate(_chunk_with_spans(text, req.chunk_size, req.overlap)):
-            chunk_txt = chunk_txt.strip()
+        for ci, (start, end, chunk_txt) in enumerate(_chunk_with_spans(text or "", req.chunk_size, req.overlap)):
+            chunk_txt = (chunk_txt or "").strip()
             if len(chunk_txt) < 30:  # ✅ skip chunks that are too short
                 continue
             # cleanup to avoid JSON artifacts / timestamps
@@ -759,6 +796,12 @@ def run_index_job(job_id: str, req: IndexRequest):
             if len(chunk_txt) < 30:
                 continue
 
+            # global intra-job dedup (prevents repeats)
+            chash = hashlib.blake2b(chunk_txt.encode("utf-8", "ignore"), digest_size=8).hexdigest()
+            if chash in seen_chunk_hashes:
+                continue
+            seen_chunk_hashes.add(chash)
+
             buf_chunks.append(chunk_txt)
             meta = {
                 "path": path,
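The recurring addition across these hunks is the job-wide seen_chunk_hashes set, keyed on 8-byte blake2b digests of the cleaned chunk text, so the same chunk is never buffered twice within one indexing job. A minimal, self-contained sketch of that dedup pattern (the function and variable names below are illustrative, not taken from main.py):

import hashlib

def dedup_chunks(chunks):
    """Keep the first occurrence of each chunk, keyed on a short blake2b digest.

    Standalone illustration of the seen_chunk_hashes logic added in run_index_job,
    not the code used by the service itself.
    """
    seen = set()
    unique = []
    for text in chunks:
        digest = hashlib.blake2b(text.encode("utf-8", "ignore"), digest_size=8).hexdigest()
        if digest in seen:
            continue  # duplicate chunk text: skip it
        seen.add(digest)
        unique.append(text)
    return unique

print(dedup_chunks(["alpha", "beta", "alpha"]))  # ['alpha', 'beta']

An 8-byte digest keeps the set cheap to hold in memory; collisions are possible in principle but negligible at the scale of a single job.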
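The FileIn change at the top of the diff swaps the required str fields for Optional[str] with empty-string defaults, so a payload with a missing or null path/text no longer fails request validation and is instead normalised by the getattr/fallback guards in the indexing loop. A quick sketch of the difference, assuming Pydantic is installed (StrictFileIn and TolerantFileIn are illustrative names, not classes from main.py):

from typing import Optional
from pydantic import BaseModel, ValidationError

class StrictFileIn(BaseModel):    # old shape: both fields required, None rejected
    path: str
    text: str

class TolerantFileIn(BaseModel):  # new shape: missing fields default to "", None accepted
    path: Optional[str] = ""
    text: Optional[str] = ""

try:
    StrictFileIn(path=None, text="hello")
except ValidationError:
    print("strict model rejects a null path")

print(TolerantFileIn(text="hello"))          # path='' text='hello'
print(TolerantFileIn(path=None, text=None))  # path=None text=None

The trade-off is that downstream code must handle None and empty values itself, which is exactly what the defensive reads in the for-loop over req.files now do.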