Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
|
@@ -15,7 +15,7 @@ ENV:
|
|
| 15 |
- HF_EMBED_MODEL (ex: "BAAI/bge-m3" | "intfloat/e5-base-v2")
|
| 16 |
- HUGGINGFACEHUB_API_TOKEN (requis si EMB_PROVIDER=hf)
|
| 17 |
- EMB_FALLBACK_TO_DUMMY (true/false)
|
| 18 |
-
- DATA_DIR (défaut "/data")
|
| 19 |
- HF_DATASET_REPO (optionnel "username/my_proj_vectors") pour export
|
| 20 |
- LOG_LEVEL (DEBUG par défaut)
|
| 21 |
- UI_PATH ("/ui")
|
|
@@ -44,7 +44,7 @@ import faiss # type: ignore
|
|
| 44 |
from pydantic import BaseModel, Field, ValidationError
|
| 45 |
from fastapi import FastAPI, HTTPException, Query
|
| 46 |
from fastapi.middleware.cors import CORSMiddleware
|
| 47 |
-
from fastapi.responses import RedirectResponse
|
| 48 |
|
| 49 |
from datasets import Dataset, Features, Sequence, Value, load_from_disk
|
| 50 |
|
|
@@ -69,15 +69,40 @@ HF_EMBED_MODEL = os.getenv("HF_EMBED_MODEL", "BAAI/bge-m3")
|
|
| 69 |
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
|
| 70 |
EMB_FALLBACK_TO_DUMMY = os.getenv("EMB_FALLBACK_TO_DUMMY", "false").lower() in ("1","true","yes","on")
|
| 71 |
|
| 72 |
-
DATA_DIR = os.getenv("DATA_DIR", "/data")
|
| 73 |
-
os.makedirs(DATA_DIR, exist_ok=True)
|
| 74 |
-
|
| 75 |
UI_PATH = os.getenv("UI_PATH", "/ui")
|
| 76 |
HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "").strip() # optionnel
|
| 77 |
|
| 78 |
if EMB_PROVIDER == "hf" and not HF_TOKEN and not EMB_FALLBACK_TO_DUMMY:
|
| 79 |
LOG.warning("EMB_PROVIDER=hf sans HUGGINGFACEHUB_API_TOKEN (pas de fallback). Mets EMB_PROVIDER=dummy ou EMB_FALLBACK_TO_DUMMY=true pour tester.")
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
# ------------------------------------------------------------------------------
|
| 82 |
# Modèles Pydantic
|
| 83 |
# ------------------------------------------------------------------------------
|
|
@@ -309,7 +334,7 @@ async def build_dataset_with_faiss(job: JobState, req: IndexRequest) -> None:
|
|
| 309 |
"chunk": Value("int32"),
|
| 310 |
"start": Value("int32"),
|
| 311 |
"end": Value("int32"),
|
| 312 |
-
"text": Value("string"),
|
| 313 |
"embedding": Sequence(Value("float32")),
|
| 314 |
})
|
| 315 |
|
|
@@ -403,7 +428,7 @@ fastapi_app.add_middleware(
|
|
| 403 |
|
| 404 |
@fastapi_app.get("/health")
|
| 405 |
async def health():
|
| 406 |
-
return {"status": "ok", "emb_provider": EMB_PROVIDER, "model": HF_EMBED_MODEL}
|
| 407 |
|
| 408 |
@fastapi_app.get("/api")
|
| 409 |
async def api_info():
|
|
@@ -415,6 +440,13 @@ async def api_info():
|
|
| 415 |
"hub_export_enabled": bool(HF_DATASET_REPO and HfApi),
|
| 416 |
}
|
| 417 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
@fastapi_app.get("/")
|
| 419 |
async def root_redirect():
|
| 420 |
return RedirectResponse(url=UI_PATH, status_code=307)
|
|
@@ -454,7 +486,6 @@ async def query(req: QueryRequest):
|
|
| 454 |
qvec = await embed_query(req.text)
|
| 455 |
if len(qvec) != vec_dim:
|
| 456 |
raise HTTPException(status_code=400, detail=f"Dim requête {len(qvec)} ≠ dim index {vec_dim}")
|
| 457 |
-
# get_nearest_examples renvoie (scores, examples)
|
| 458 |
scores, ex = ds.get_nearest_examples("embedding", np.array(qvec, dtype=np.float32), k=req.top_k)
|
| 459 |
results = []
|
| 460 |
for s, path, chunk, text in zip(scores, ex["path"], ex["chunk"], ex["text"]):
|
|
@@ -464,10 +495,6 @@ async def query(req: QueryRequest):
|
|
| 464 |
|
| 465 |
@fastapi_app.post("/export_hub")
|
| 466 |
async def export_hub(project_id: str = Query(..., min_length=1), repo_id: Optional[str] = None):
|
| 467 |
-
"""
|
| 468 |
-
Optionnel: push le dossier du projet (dataset + faiss + meta) dans un repo Dataset du Hub.
|
| 469 |
-
- HF_DATASET_REPO ou ?repo_id=... (ex: "chourmovs/deepweb_vectors")
|
| 470 |
-
"""
|
| 471 |
if not HfApi or not HF_TOKEN:
|
| 472 |
raise HTTPException(status_code=400, detail="huggingface_hub non dispo ou HF token absent.")
|
| 473 |
p = project_paths(project_id)
|
|
@@ -483,7 +510,7 @@ async def export_hub(project_id: str = Query(..., min_length=1), repo_id: Option
|
|
| 483 |
except Exception:
|
| 484 |
pass
|
| 485 |
|
| 486 |
-
#
|
| 487 |
buf = io.BytesIO()
|
| 488 |
base_dir = p["base"]
|
| 489 |
zip_name = f"{project_id}_vectors.zip"
|
|
@@ -514,7 +541,7 @@ def _default_two_docs() -> List[Dict[str, str]]:
|
|
| 514 |
|
| 515 |
async def ui_wipe(project: str):
|
| 516 |
try:
|
| 517 |
-
resp = await wipe(project)
|
| 518 |
return f"✅ Wipe ok — projet {resp['project_id']} vidé."
|
| 519 |
except Exception as e:
|
| 520 |
LOG.exception("wipe UI error")
|
|
|
|
| 15 |
- HF_EMBED_MODEL (ex: "BAAI/bge-m3" | "intfloat/e5-base-v2")
|
| 16 |
- HUGGINGFACEHUB_API_TOKEN (requis si EMB_PROVIDER=hf)
|
| 17 |
- EMB_FALLBACK_TO_DUMMY (true/false)
|
| 18 |
+
- DATA_DIR (optionnel : si absent ou non inscriptible, premier répertoire accessible en écriture parmi ./data, /home/user/app/data, /home/user/data, /tmp/data)
|
| 19 |
- HF_DATASET_REPO (optionnel "username/my_proj_vectors") pour export
|
| 20 |
- LOG_LEVEL (DEBUG par défaut)
|
| 21 |
- UI_PATH ("/ui")
|
|
|
|
| 44 |
from pydantic import BaseModel, Field, ValidationError
|
| 45 |
from fastapi import FastAPI, HTTPException, Query
|
| 46 |
from fastapi.middleware.cors import CORSMiddleware
|
| 47 |
+
from fastapi.responses import RedirectResponse
|
| 48 |
|
| 49 |
from datasets import Dataset, Features, Sequence, Value, load_from_disk
|
| 50 |
|
|
|
|
| 69 |
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
|
| 70 |
EMB_FALLBACK_TO_DUMMY = os.getenv("EMB_FALLBACK_TO_DUMMY", "false").lower() in ("1","true","yes","on")
|
| 71 |
|
|
|
|
|
|
|
|
|
|
| 72 |
UI_PATH = os.getenv("UI_PATH", "/ui")
|
| 73 |
HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "").strip() # optionnel
|
| 74 |
|
| 75 |
if EMB_PROVIDER == "hf" and not HF_TOKEN and not EMB_FALLBACK_TO_DUMMY:
|
| 76 |
LOG.warning("EMB_PROVIDER=hf sans HUGGINGFACEHUB_API_TOKEN (pas de fallback). Mets EMB_PROVIDER=dummy ou EMB_FALLBACK_TO_DUMMY=true pour tester.")
|
| 77 |
|
| 78 |
+
# ------------------------------------------------------------------------------
|
| 79 |
+
# Sélection robuste d'un DATA_DIR writable
|
| 80 |
+
# ------------------------------------------------------------------------------
|
| 81 |
+
def pick_data_dir() -> str:
    """Return the first candidate directory that can be created and written to.

    Candidates are tried in this order: the ``DATA_DIR`` environment variable
    (skipped when unset or blank), ``./data`` under the current working
    directory, ``/home/user/app/data``, ``/home/user/data`` and ``/tmp/data``.
    Each candidate is created if missing, then probed by writing a
    throw-away file inside it.

    Returns:
        The first candidate that passed the write probe, exactly as listed
        (the path is not normalised).

    Raises:
        RuntimeError: if none of the candidates is writable.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    import tempfile

    candidates = [
        os.getenv("DATA_DIR", "").strip(),   # the environment variable wins when provided
        os.path.join(os.getcwd(), "data"),   # ./data inside the working directory
        "/home/user/app/data",               # locations commonly used on HF Spaces
        "/home/user/data",
        "/tmp/data",                         # last-resort fallback
    ]
    for p in candidates:
        if not p:
            continue
        try:
            os.makedirs(p, exist_ok=True)
            # A uniquely named probe file avoids the race a fixed ".rw_test"
            # name has when several processes start at once (one process
            # deleting the other's probe made a writable directory look
            # unusable). The file is removed automatically on close.
            with tempfile.NamedTemporaryFile(
                "w", encoding="utf-8", dir=p, prefix=".rw_test_"
            ) as probe:
                probe.write("ok")
        except (OSError, ValueError) as e:
            # Best effort: report the failure and move on to the next candidate.
            LOG.warning("[DATA_DIR] Candidat non writable '%s': %s", p, e)
            continue
        LOG.info("[DATA_DIR] Utilisation de: %s", p)
        return p
    raise RuntimeError("Aucun répertoire DATA_DIR accessible en écriture.")
|
| 103 |
+
|
| 104 |
+
DATA_DIR = pick_data_dir()
|
| 105 |
+
|
| 106 |
# ------------------------------------------------------------------------------
|
| 107 |
# Modèles Pydantic
|
| 108 |
# ------------------------------------------------------------------------------
|
|
|
|
| 334 |
"chunk": Value("int32"),
|
| 335 |
"start": Value("int32"),
|
| 336 |
"end": Value("int32"),
|
| 337 |
+
"text": Value("string"),
|
| 338 |
"embedding": Sequence(Value("float32")),
|
| 339 |
})
|
| 340 |
|
|
|
|
| 428 |
|
| 429 |
@fastapi_app.get("/health")
async def health():
    """Report a static "ok" status with the embedding provider, model and data directory."""
    return dict(
        status="ok",
        emb_provider=EMB_PROVIDER,
        model=HF_EMBED_MODEL,
        data_dir=DATA_DIR,
    )
|
| 432 |
|
| 433 |
@fastapi_app.get("/api")
|
| 434 |
async def api_info():
|
|
|
|
| 440 |
"hub_export_enabled": bool(HF_DATASET_REPO and HfApi),
|
| 441 |
}
|
| 442 |
|
| 443 |
+
@fastapi_app.get("/debug/paths")
async def debug_paths(project_id: Optional[str] = None):
    """Expose the data directory and working directory for troubleshooting.

    When ``project_id`` is given (and non-empty), the result of
    ``project_paths(project_id)`` is added under the ``"project_paths"`` key.
    """
    payload = {"DATA_DIR": DATA_DIR, "cwd": os.getcwd()}
    if not project_id:
        # No project requested: only the global locations are reported.
        return payload
    payload["project_paths"] = project_paths(project_id)
    return payload
|
| 449 |
+
|
| 450 |
@fastapi_app.get("/")
|
| 451 |
async def root_redirect():
|
| 452 |
return RedirectResponse(url=UI_PATH, status_code=307)
|
|
|
|
| 486 |
qvec = await embed_query(req.text)
|
| 487 |
if len(qvec) != vec_dim:
|
| 488 |
raise HTTPException(status_code=400, detail=f"Dim requête {len(qvec)} ≠ dim index {vec_dim}")
|
|
|
|
| 489 |
scores, ex = ds.get_nearest_examples("embedding", np.array(qvec, dtype=np.float32), k=req.top_k)
|
| 490 |
results = []
|
| 491 |
for s, path, chunk, text in zip(scores, ex["path"], ex["chunk"], ex["text"]):
|
|
|
|
| 495 |
|
| 496 |
@fastapi_app.post("/export_hub")
|
| 497 |
async def export_hub(project_id: str = Query(..., min_length=1), repo_id: Optional[str] = None):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
if not HfApi or not HF_TOKEN:
|
| 499 |
raise HTTPException(status_code=400, detail="huggingface_hub non dispo ou HF token absent.")
|
| 500 |
p = project_paths(project_id)
|
|
|
|
| 510 |
except Exception:
|
| 511 |
pass
|
| 512 |
|
| 513 |
+
# zip le dossier projet
|
| 514 |
buf = io.BytesIO()
|
| 515 |
base_dir = p["base"]
|
| 516 |
zip_name = f"{project_id}_vectors.zip"
|
|
|
|
| 541 |
|
| 542 |
async def ui_wipe(project: str):
|
| 543 |
try:
|
| 544 |
+
resp = await wipe(project)
|
| 545 |
return f"✅ Wipe ok — projet {resp['project_id']} vidé."
|
| 546 |
except Exception as e:
|
| 547 |
LOG.exception("wipe UI error")
|