Create main.py
main.py
ADDED
@@ -0,0 +1,234 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
import os, time, uuid, logging
from typing import List, Optional, Dict, Any, Tuple
import requests
import numpy as np
from fastapi import FastAPI, BackgroundTasks, Header, HTTPException
from pydantic import BaseModel, Field
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PointStruct

logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger("remote_indexer")

# ---------- ENV ----------
AUTH_TOKEN = os.getenv("REMOTE_INDEX_TOKEN", "").strip()  # simple header auth
HF_TOKEN = os.getenv("HF_API_TOKEN", "").strip()
HF_MODEL = os.getenv("HF_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
HF_URL = os.getenv("HF_API_URL", "").strip() or f"https://api-inference.huggingface.co/pipeline/feature-extraction/{HF_MODEL}"

QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
QDRANT_API = os.getenv("QDRANT_API_KEY", "").strip()
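# Example environment (illustrative values, not shipped defaults):
#   REMOTE_INDEX_TOKEN=change-me   -> clients must send the same value in the X-Auth-Token header
#                                     (auth is skipped when this variable is unset)
#   HF_API_TOKEN=hf_xxx            -> Hugging Face Inference API token
#   HF_EMBED_MODEL=sentence-transformers/all-MiniLM-L6-v2
#   QDRANT_URL=https://your-cluster.example:6333
#   QDRANT_API_KEY=...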

if not HF_TOKEN:
    LOG.warning("HF_API_TOKEN is missing; the service will refuse /index and /query.")

# ---------- Clients ----------
qdr = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API if QDRANT_API else None)

# ---------- Pydantic ----------
class FileIn(BaseModel):
    path: str
    text: str

class IndexRequest(BaseModel):
    project_id: str = Field(..., min_length=1)
    files: List[FileIn]
    chunk_size: int = 1200
    overlap: int = 200
    batch_size: int = 8
    store_text: bool = True

class QueryRequest(BaseModel):
    project_id: str
    query: str
    top_k: int = 6

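# Example /index request body (illustrative values), matching the IndexRequest model above:
# {
#   "project_id": "demo",
#   "files": [{"path": "README.md", "text": "...full file contents..."}],
#   "chunk_size": 1200, "overlap": 200, "batch_size": 8, "store_text": true
# }
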
# ---------- Jobs store (in memory) ----------
JOBS: Dict[str, Dict[str, Any]] = {}  # {job_id: {"status": "...", "logs": [...], "created": ts}}

# ---------- Utils ----------
def _auth(x_auth: Optional[str]):
    if AUTH_TOKEN and (x_auth or "") != AUTH_TOKEN:
        raise HTTPException(status_code=401, detail="Unauthorized")

def _post_embeddings(batch: List[str]) -> Tuple[np.ndarray, int]:
    if not HF_TOKEN:
        raise RuntimeError("HF_API_TOKEN missing (server).")
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    r = requests.post(HF_URL, headers=headers, json=batch, timeout=120)
    size = int(r.headers.get("Content-Length", "0"))  # approximate response size, used for logging
    r.raise_for_status()
    data = r.json()
    arr = np.array(data, dtype=np.float32)
    # arr: [batch, dim] (sentence-transformers)
    # or [batch, tokens, dim] -> mean pooling
    if arr.ndim == 3:
        arr = arr.mean(axis=1)
    if arr.ndim != 2:
        raise RuntimeError(f"Unexpected embeddings shape: {arr.shape}")
    # L2 normalization
    norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
    arr = arr / norms
    return arr.astype(np.float32), size

def _ensure_collection(name: str, dim: int):
    try:
        qdr.get_collection(name)
        return
    except Exception:
        pass
    qdr.create_collection(
        collection_name=name,
        vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
    )

def _chunk_with_spans(text: str, size: int, overlap: int):
    n = len(text)
    if size <= 0:
        yield (0, n, text)
        return
    i = 0
    while i < n:
        j = min(n, i + size)
        yield (i, j, text[i:j])
        i = max(0, j - overlap)
        if j >= n:  # the last chunk reached the end of the text
            break

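# Worked example for _chunk_with_spans (illustrative numbers): with size=1200 and
# overlap=200 over a 3000-character text it yields spans (0, 1200), (1000, 2200)
# and (2000, 3000); consecutive chunks overlap by `overlap` characters.
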
def _append_log(job_id: str, line: str):
    job = JOBS.get(job_id)
    if not job: return
    job["logs"].append(line)

def _set_status(job_id: str, status: str):
    job = JOBS.get(job_id)
    if not job: return
    job["status"] = status

# ---------- Background task ----------
def run_index_job(job_id: str, req: IndexRequest):
    try:
        _set_status(job_id, "running")
        total_chunks = 0
        LOG.info(f"[{job_id}] Index start project={req.project_id} files={len(req.files)}")
        _append_log(job_id, f"Start project={req.project_id} files={len(req.files)}")

        # first batch, used to discover the embedding dimension
        # (a small warm-up batch built from the first file)
        warmup = []
        for f in req.files[:1]:
            warmup.append(next(_chunk_with_spans(f.text, req.chunk_size, req.overlap))[2])
        embs, sz = _post_embeddings(warmup)
        dim = embs.shape[1]
        col = f"proj_{req.project_id}"
        _ensure_collection(col, dim)
        _append_log(job_id, f"Collection ready: {col} (dim={dim})")

        points_buffer: List[PointStruct] = []
        point_id = 0

        def flush_points():
            nonlocal points_buffer
            if points_buffer:
                qdr.upsert(collection_name=col, points=points_buffer)
                points_buffer = []

        # loop over files
        for fi, f in enumerate(req.files, 1):
            chunks, metas = [], []
            for ci, (start, end, chunk_txt) in enumerate(_chunk_with_spans(f.text, req.chunk_size, req.overlap)):
                chunks.append(chunk_txt)
                payload = {"path": f.path, "chunk": ci, "start": start, "end": end}
                if req.store_text:
                    payload["text"] = chunk_txt
                metas.append(payload)

                if len(chunks) >= req.batch_size:
                    vecs, sz = _post_embeddings(chunks)
                    batch_points = []
                    for k, vec in enumerate(vecs):
                        batch_points.append(PointStruct(id=point_id, vector=vec.tolist(), payload=metas[k]))
                        point_id += 1
                    qdr.upsert(collection_name=col, points=batch_points)
                    total_chunks += len(chunks)
                    _append_log(job_id, f"file {fi}/{len(req.files)}: +{len(chunks)} chunks (total={total_chunks}) ~{sz/1024:.1f}KiB")
                    chunks, metas = [], []

            # flush the remaining chunks at the end of each file
            if chunks:
                vecs, sz = _post_embeddings(chunks)
                batch_points = []
                for k, vec in enumerate(vecs):
                    batch_points.append(PointStruct(id=point_id, vector=vec.tolist(), payload=metas[k]))
                    point_id += 1
                qdr.upsert(collection_name=col, points=batch_points)
                total_chunks += len(chunks)
                _append_log(job_id, f"file {fi}/{len(req.files)}: +{len(chunks)} chunks (total={total_chunks}) ~{sz/1024:.1f}KiB")

        flush_points()
        _append_log(job_id, f"Done. chunks={total_chunks}")
        _set_status(job_id, "done")
        LOG.info(f"[{job_id}] Index finished. chunks={total_chunks}")
    except Exception as e:
        LOG.exception("Index job failed")
        _append_log(job_id, f"ERROR: {e}")
        _set_status(job_id, "error")

# ---------- API ----------
app = FastAPI()

@app.get("/health")
def health():
    return {"ok": True}

@app.post("/index")
def start_index(req: IndexRequest, background_tasks: BackgroundTasks, x_auth_token: Optional[str] = Header(default=None)):
    _auth(x_auth_token)
    if not HF_TOKEN:
        raise HTTPException(400, "HF_API_TOKEN missing on the server.")
    job_id = uuid.uuid4().hex[:12]
    JOBS[job_id] = {"status": "queued", "logs": [], "created": time.time()}
    background_tasks.add_task(run_index_job, job_id, req)
    return {"job_id": job_id}

@app.get("/status/{job_id}")
def status(job_id: str, x_auth_token: Optional[str] = Header(default=None)):
    _auth(x_auth_token)
    j = JOBS.get(job_id)
    if not j:
        raise HTTPException(404, "unknown job")
    return {"status": j["status"], "logs": j["logs"][-800:]}

@app.post("/query")
def query(req: QueryRequest, x_auth_token: Optional[str] = Header(default=None)):
    _auth(x_auth_token)
    if not HF_TOKEN:
        raise HTTPException(400, "HF_API_TOKEN missing on the server.")
    vec, _ = _post_embeddings([req.query])
    vec = vec[0].tolist()
    col = f"proj_{req.project_id}"
    try:
        res = qdr.search(collection_name=col, query_vector=vec, limit=int(req.top_k))
    except Exception as e:
        raise HTTPException(400, f"Search failed: {e}")
    out = []
    for p in res:
        pl = p.payload or {}
        txt = pl.get("text")
        # hard cap snippet size
        if txt and len(txt) > 800:
            txt = txt[:800] + "..."
        out.append({"path": pl.get("path"), "chunk": pl.get("chunk"), "start": pl.get("start"), "end": pl.get("end"), "text": txt})
    return {"results": out}

@app.post("/wipe")
def wipe_collection(project_id: str, x_auth_token: Optional[str] = Header(default=None)):
    _auth(x_auth_token)
    col = f"proj_{project_id}"
    try:
        qdr.delete_collection(col)
        return {"ok": True}
    except Exception as e:
        raise HTTPException(400, f"wipe failed: {e}")
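
For reference, a minimal client sketch (not part of the Space) showing how the three endpoints fit together: the base URL, token, project id and file path below are placeholders, and it assumes REMOTE_INDEX_TOKEN is set on the server so the X-Auth-Token header is enforced.

# client_example.py -- illustrative sketch, placeholder values throughout
import time
import requests

BASE = "https://your-space.hf.space"   # placeholder: the deployed Space URL
TOKEN = "change-me"                     # must match REMOTE_INDEX_TOKEN on the server
HDRS = {"X-Auth-Token": TOKEN}

# 1) start an indexing job (reads one local file as an example)
files = [{"path": "docs/README.md", "text": open("README.md", encoding="utf-8").read()}]
job = requests.post(f"{BASE}/index", headers=HDRS,
                    json={"project_id": "demo", "files": files}, timeout=60).json()
job_id = job["job_id"]

# 2) poll /status until the background task reports done or error
while True:
    st = requests.get(f"{BASE}/status/{job_id}", headers=HDRS, timeout=30).json()
    print(st["status"], st["logs"][-1:] if st["logs"] else "")
    if st["status"] in ("done", "error"):
        break
    time.sleep(2)

# 3) query the project collection
hits = requests.post(f"{BASE}/query", headers=HDRS,
                     json={"project_id": "demo", "query": "how do I configure auth?", "top_k": 3},
                     timeout=60).json()
for h in hits["results"]:
    print(h["path"], h["chunk"], (h["text"] or "")[:80])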