chouchouvs commited on
Commit
3660136
·
verified ·
1 Parent(s): 4aba207

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +41 -14
main.py CHANGED
@@ -15,7 +15,7 @@ ENV:
15
  - HF_EMBED_MODEL (ex: "BAAI/bge-m3" | "intfloat/e5-base-v2")
16
  - HUGGINGFACEHUB_API_TOKEN (requis si EMB_PROVIDER=hf)
17
  - EMB_FALLBACK_TO_DUMMY (true/false)
18
- - DATA_DIR (défaut "/data") stockage local par projet
19
  - HF_DATASET_REPO (optionnel "username/my_proj_vectors") pour export
20
  - LOG_LEVEL (DEBUG par défaut)
21
  - UI_PATH ("/ui")
@@ -44,7 +44,7 @@ import faiss # type: ignore
44
  from pydantic import BaseModel, Field, ValidationError
45
  from fastapi import FastAPI, HTTPException, Query
46
  from fastapi.middleware.cors import CORSMiddleware
47
- from fastapi.responses import RedirectResponse, StreamingResponse
48
 
49
  from datasets import Dataset, Features, Sequence, Value, load_from_disk
50
 
@@ -69,15 +69,40 @@ HF_EMBED_MODEL = os.getenv("HF_EMBED_MODEL", "BAAI/bge-m3")
69
  HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
70
  EMB_FALLBACK_TO_DUMMY = os.getenv("EMB_FALLBACK_TO_DUMMY", "false").lower() in ("1","true","yes","on")
71
 
72
- DATA_DIR = os.getenv("DATA_DIR", "/data")
73
- os.makedirs(DATA_DIR, exist_ok=True)
74
-
75
  UI_PATH = os.getenv("UI_PATH", "/ui")
76
  HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "").strip() # optionnel
77
 
78
  if EMB_PROVIDER == "hf" and not HF_TOKEN and not EMB_FALLBACK_TO_DUMMY:
79
  LOG.warning("EMB_PROVIDER=hf sans HUGGINGFACEHUB_API_TOKEN (pas de fallback). Mets EMB_PROVIDER=dummy ou EMB_FALLBACK_TO_DUMMY=true pour tester.")
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  # ------------------------------------------------------------------------------
82
  # Modèles Pydantic
83
  # ------------------------------------------------------------------------------
@@ -309,7 +334,7 @@ async def build_dataset_with_faiss(job: JobState, req: IndexRequest) -> None:
309
  "chunk": Value("int32"),
310
  "start": Value("int32"),
311
  "end": Value("int32"),
312
- "text": Value("string"), # peut contenir None -> sera "None" si None ; OK pour tests
313
  "embedding": Sequence(Value("float32")),
314
  })
315
 
@@ -403,7 +428,7 @@ fastapi_app.add_middleware(
403
 
404
  @fastapi_app.get("/health")
405
  async def health():
406
- return {"status": "ok", "emb_provider": EMB_PROVIDER, "model": HF_EMBED_MODEL}
407
 
408
  @fastapi_app.get("/api")
409
  async def api_info():
@@ -415,6 +440,13 @@ async def api_info():
415
  "hub_export_enabled": bool(HF_DATASET_REPO and HfApi),
416
  }
417
 
 
 
 
 
 
 
 
418
  @fastapi_app.get("/")
419
  async def root_redirect():
420
  return RedirectResponse(url=UI_PATH, status_code=307)
@@ -454,7 +486,6 @@ async def query(req: QueryRequest):
454
  qvec = await embed_query(req.text)
455
  if len(qvec) != vec_dim:
456
  raise HTTPException(status_code=400, detail=f"Dim requête {len(qvec)} ≠ dim index {vec_dim}")
457
- # get_nearest_examples renvoie (scores, examples)
458
  scores, ex = ds.get_nearest_examples("embedding", np.array(qvec, dtype=np.float32), k=req.top_k)
459
  results = []
460
  for s, path, chunk, text in zip(scores, ex["path"], ex["chunk"], ex["text"]):
@@ -464,10 +495,6 @@ async def query(req: QueryRequest):
464
 
465
  @fastapi_app.post("/export_hub")
466
  async def export_hub(project_id: str = Query(..., min_length=1), repo_id: Optional[str] = None):
467
- """
468
- Optionnel: push le dossier du projet (dataset + faiss + meta) dans un repo Dataset du Hub.
469
- - HF_DATASET_REPO ou ?repo_id=... (ex: "chourmovs/deepweb_vectors")
470
- """
471
  if not HfApi or not HF_TOKEN:
472
  raise HTTPException(status_code=400, detail="huggingface_hub non dispo ou HF token absent.")
473
  p = project_paths(project_id)
@@ -483,7 +510,7 @@ async def export_hub(project_id: str = Query(..., min_length=1), repo_id: Option
483
  except Exception:
484
  pass
485
 
486
- # Zipper le dossier projet pour un upload rapide
487
  buf = io.BytesIO()
488
  base_dir = p["base"]
489
  zip_name = f"{project_id}_vectors.zip"
@@ -514,7 +541,7 @@ def _default_two_docs() -> List[Dict[str, str]]:
514
 
515
  async def ui_wipe(project: str):
516
  try:
517
- resp = await wipe(project) # appelle route interne
518
  return f"✅ Wipe ok — projet {resp['project_id']} vidé."
519
  except Exception as e:
520
  LOG.exception("wipe UI error")
 
15
  - HF_EMBED_MODEL (ex: "BAAI/bge-m3" | "intfloat/e5-base-v2")
16
  - HUGGINGFACEHUB_API_TOKEN (requis si EMB_PROVIDER=hf)
17
  - EMB_FALLBACK_TO_DUMMY (true/false)
18
+ - DATA_DIR (par défaut: auto-pick writable: $DATA_DIR, ./data, /home/user/app/data, /home/user/data, /tmp/data)
19
  - HF_DATASET_REPO (optionnel "username/my_proj_vectors") pour export
20
  - LOG_LEVEL (DEBUG par défaut)
21
  - UI_PATH ("/ui")
 
44
  from pydantic import BaseModel, Field, ValidationError
45
  from fastapi import FastAPI, HTTPException, Query
46
  from fastapi.middleware.cors import CORSMiddleware
47
+ from fastapi.responses import RedirectResponse
48
 
49
  from datasets import Dataset, Features, Sequence, Value, load_from_disk
50
 
 
69
  HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
70
  EMB_FALLBACK_TO_DUMMY = os.getenv("EMB_FALLBACK_TO_DUMMY", "false").lower() in ("1","true","yes","on")
71
 
 
 
 
72
  UI_PATH = os.getenv("UI_PATH", "/ui")
73
  HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "").strip() # optionnel
74
 
75
  if EMB_PROVIDER == "hf" and not HF_TOKEN and not EMB_FALLBACK_TO_DUMMY:
76
  LOG.warning("EMB_PROVIDER=hf sans HUGGINGFACEHUB_API_TOKEN (pas de fallback). Mets EMB_PROVIDER=dummy ou EMB_FALLBACK_TO_DUMMY=true pour tester.")
77
 
78
+ # ------------------------------------------------------------------------------
79
+ # Sélection robuste d'un DATA_DIR writable
80
+ # ------------------------------------------------------------------------------
81
+ def pick_data_dir() -> str:
82
+ candidates = [
83
+ os.getenv("DATA_DIR", "").strip(), # priorité à l'env si fourni
84
+ os.path.join(os.getcwd(), "data"), # ./data dans le WORKDIR (/app)
85
+ "/home/user/app/data", # chemins typiques HF Spaces
86
+ "/home/user/data",
87
+ "/tmp/data", # toujours writable
88
+ ]
89
+ for p in candidates:
90
+ if not p:
91
+ continue
92
+ try:
93
+ os.makedirs(p, exist_ok=True)
94
+ testp = os.path.join(p, ".rw_test")
95
+ with open(testp, "w", encoding="utf-8") as f:
96
+ f.write("ok")
97
+ os.remove(testp)
98
+ LOG.info(f"[DATA_DIR] Utilisation de: {p}")
99
+ return p
100
+ except Exception as e:
101
+ LOG.warning(f"[DATA_DIR] Candidat non writable '{p}': {e}")
102
+ raise RuntimeError("Aucun répertoire DATA_DIR accessible en écriture.")
103
+
104
+ DATA_DIR = pick_data_dir()
105
+
106
  # ------------------------------------------------------------------------------
107
  # Modèles Pydantic
108
  # ------------------------------------------------------------------------------
 
334
  "chunk": Value("int32"),
335
  "start": Value("int32"),
336
  "end": Value("int32"),
337
+ "text": Value("string"),
338
  "embedding": Sequence(Value("float32")),
339
  })
340
 
 
428
 
429
  @fastapi_app.get("/health")
430
  async def health():
431
+ return {"status": "ok", "emb_provider": EMB_PROVIDER, "model": HF_EMBED_MODEL, "data_dir": DATA_DIR}
432
 
433
  @fastapi_app.get("/api")
434
  async def api_info():
 
440
  "hub_export_enabled": bool(HF_DATASET_REPO and HfApi),
441
  }
442
 
443
+ @fastapi_app.get("/debug/paths")
444
+ async def debug_paths(project_id: Optional[str] = None):
445
+ res = {"DATA_DIR": DATA_DIR, "cwd": os.getcwd()}
446
+ if project_id:
447
+ res["project_paths"] = project_paths(project_id)
448
+ return res
449
+
450
  @fastapi_app.get("/")
451
  async def root_redirect():
452
  return RedirectResponse(url=UI_PATH, status_code=307)
 
486
  qvec = await embed_query(req.text)
487
  if len(qvec) != vec_dim:
488
  raise HTTPException(status_code=400, detail=f"Dim requête {len(qvec)} ≠ dim index {vec_dim}")
 
489
  scores, ex = ds.get_nearest_examples("embedding", np.array(qvec, dtype=np.float32), k=req.top_k)
490
  results = []
491
  for s, path, chunk, text in zip(scores, ex["path"], ex["chunk"], ex["text"]):
 
495
 
496
  @fastapi_app.post("/export_hub")
497
  async def export_hub(project_id: str = Query(..., min_length=1), repo_id: Optional[str] = None):
 
 
 
 
498
  if not HfApi or not HF_TOKEN:
499
  raise HTTPException(status_code=400, detail="huggingface_hub non dispo ou HF token absent.")
500
  p = project_paths(project_id)
 
510
  except Exception:
511
  pass
512
 
513
+ # zip le dossier projet
514
  buf = io.BytesIO()
515
  base_dir = p["base"]
516
  zip_name = f"{project_id}_vectors.zip"
 
541
 
542
  async def ui_wipe(project: str):
543
  try:
544
+ resp = await wipe(project)
545
  return f"✅ Wipe ok — projet {resp['project_id']} vidé."
546
  except Exception as e:
547
  LOG.exception("wipe UI error")