Spaces:
Running
Running
Fix: add repo_type=dataset and force_download=True for public dataset
Browse files
- chatbot_retriever.py +16 -9
chatbot_retriever.py
CHANGED
|
@@ -72,7 +72,10 @@ import os
|
|
| 72 |
DATASET_REPO = "07Codex07/PrepGraph-Data"
|
| 73 |
|
| 74 |
def ensure_data_dir():
|
| 75 |
-
"""Ensure data/ folder exists and
|
|
|
|
|
|
|
|
|
|
| 76 |
data_dir = os.getenv("DATA_DIR", "data")
|
| 77 |
os.makedirs(data_dir, exist_ok=True)
|
| 78 |
|
|
@@ -98,21 +101,25 @@ def ensure_data_dir():
|
|
| 98 |
|
| 99 |
local_paths = []
|
| 100 |
for f in files:
|
| 101 |
-
# ✅
|
| 102 |
-
|
| 103 |
-
os.makedirs(os.path.dirname(download_path), exist_ok=True)
|
| 104 |
|
| 105 |
-
if not os.path.exists(
|
| 106 |
print(f"📥 Downloading {f} from Hugging Face (public dataset)...")
|
| 107 |
downloaded = hf_hub_download(
|
| 108 |
repo_id=DATASET_REPO,
|
| 109 |
filename=f,
|
| 110 |
-
repo_type="dataset",
|
| 111 |
-
force_download=True
|
| 112 |
)
|
| 113 |
-
|
|
|
|
|
|
|
| 114 |
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
return local_paths
|
| 118 |
|
|
|
|
| 72 |
DATASET_REPO = "07Codex07/PrepGraph-Data"
|
| 73 |
|
| 74 |
def ensure_data_dir():
|
| 75 |
+
"""Ensure data/ folder exists and contains the Hugging Face dataset PDFs properly structured."""
|
| 76 |
+
from huggingface_hub import hf_hub_download
|
| 77 |
+
import shutil
|
| 78 |
+
|
| 79 |
data_dir = os.getenv("DATA_DIR", "data")
|
| 80 |
os.makedirs(data_dir, exist_ok=True)
|
| 81 |
|
|
|
|
| 101 |
|
| 102 |
local_paths = []
|
| 103 |
for f in files:
|
| 104 |
+
dest_path = os.path.join(data_dir, f) # ✅ keep real folder structure
|
| 105 |
+
os.makedirs(os.path.dirname(dest_path), exist_ok=True)
|
|
|
|
| 106 |
|
| 107 |
+
if not os.path.exists(dest_path):
|
| 108 |
print(f"📥 Downloading {f} from Hugging Face (public dataset)...")
|
| 109 |
downloaded = hf_hub_download(
|
| 110 |
repo_id=DATASET_REPO,
|
| 111 |
filename=f,
|
| 112 |
+
repo_type="dataset",
|
| 113 |
+
force_download=True,
|
| 114 |
)
|
| 115 |
+
shutil.copy(downloaded, dest_path) # ✅ copy instead of rename (works inside HF Spaces)
|
| 116 |
+
|
| 117 |
+
local_paths.append(dest_path)
|
| 118 |
|
| 119 |
+
# Debug info for verification
|
| 120 |
+
print(f"✅ Total files ensured: {len(local_paths)}")
|
| 121 |
+
for p in local_paths[:3]:
|
| 122 |
+
print(f" → {p}")
|
| 123 |
|
| 124 |
return local_paths
|
| 125 |
|