Spaces:
Sleeping
Sleeping
Fix: pass repo_type="dataset" and force_download=True to hf_hub_download for the public dataset
Browse files - chatbot_retriever.py +9 -3
chatbot_retriever.py
CHANGED
|
@@ -100,13 +100,19 @@ def ensure_data_dir():
|
|
| 100 |
for f in files:
|
| 101 |
local_path = os.path.join(data_dir, f.replace("/", "_"))
|
| 102 |
if not os.path.exists(local_path):
|
| 103 |
-
print(f"📥 Downloading {f} from Hugging Face...")
|
| 104 |
-
downloaded = hf_hub_download(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
os.rename(downloaded, local_path)
|
| 106 |
local_paths.append(local_path)
|
| 107 |
return local_paths
|
| 108 |
|
| 109 |
|
|
|
|
| 110 |
def detect_subject(fname: str) -> Optional[str]:
|
| 111 |
# light heuristic to guess subject code from filename
|
| 112 |
t = (fname or "").lower()
|
|
@@ -191,7 +197,7 @@ def load_all_docs(base_dir: str = DATA_DIR) -> List:
|
|
| 191 |
def build_or_load_indexes(force_reindex: bool = False):
|
| 192 |
if os.getenv("FORCE_REINDEX", "0").lower() in ("1", "true", "yes"):
|
| 193 |
force_reindex = True
|
| 194 |
-
|
| 195 |
ensure_data_dir()
|
| 196 |
docs = load_all_docs(DATA_DIR)
|
| 197 |
if not docs:
|
|
|
|
| 100 |
for f in files:
|
| 101 |
local_path = os.path.join(data_dir, f.replace("/", "_"))
|
| 102 |
if not os.path.exists(local_path):
|
| 103 |
+
print(f"📥 Downloading {f} from Hugging Face (public dataset)...")
|
| 104 |
+
downloaded = hf_hub_download(
|
| 105 |
+
repo_id=DATASET_REPO,
|
| 106 |
+
filename=f,
|
| 107 |
+
repo_type="dataset", # ✅ tells HF it's a dataset
|
| 108 |
+
force_download=True # ✅ bypass any bad cached 401s
|
| 109 |
+
)
|
| 110 |
os.rename(downloaded, local_path)
|
| 111 |
local_paths.append(local_path)
|
| 112 |
return local_paths
|
| 113 |
|
| 114 |
|
| 115 |
+
|
| 116 |
def detect_subject(fname: str) -> Optional[str]:
|
| 117 |
# light heuristic to guess subject code from filename
|
| 118 |
t = (fname or "").lower()
|
|
|
|
| 197 |
def build_or_load_indexes(force_reindex: bool = False):
|
| 198 |
if os.getenv("FORCE_REINDEX", "0").lower() in ("1", "true", "yes"):
|
| 199 |
force_reindex = True
|
| 200 |
+
|
| 201 |
ensure_data_dir()
|
| 202 |
docs = load_all_docs(DATA_DIR)
|
| 203 |
if not docs:
|