Spaces:
Sleeping
Sleeping
Fix: auto-download PDFs from HF dataset before loading
Browse files- chatbot_retriever.py +42 -1
chatbot_retriever.py
CHANGED
|
@@ -66,6 +66,46 @@ SEARCH_EXPANSION = int(os.getenv("FAISS_SEARCH_EXPANSION", 5))
|
|
| 66 |
logger = logging.getLogger(__name__)
|
| 67 |
logger.setLevel(logging.INFO)
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
def detect_subject(fname: str) -> Optional[str]:
|
| 71 |
# light heuristic to guess subject code from filename
|
|
@@ -151,7 +191,8 @@ def load_all_docs(base_dir: str = DATA_DIR) -> List:
|
|
| 151 |
def build_or_load_indexes(force_reindex: bool = False):
|
| 152 |
if os.getenv("FORCE_REINDEX", "0").lower() in ("1", "true", "yes"):
|
| 153 |
force_reindex = True
|
| 154 |
-
|
|
|
|
| 155 |
docs = load_all_docs(DATA_DIR)
|
| 156 |
if not docs:
|
| 157 |
logger.warning("No documents found. Returning empty indexes.")
|
|
|
|
| 66 |
logger = logging.getLogger(__name__)
|
| 67 |
logger.setLevel(logging.INFO)
|
| 68 |
|
| 69 |
+
from huggingface_hub import hf_hub_download
|
| 70 |
+
import os
|
| 71 |
+
|
| 72 |
+
DATASET_REPO = "07Codex07/PrepGraph-Data"
|
| 73 |
+
|
| 74 |
+
def ensure_data_dir() -> list:
    """Make sure the data folder exists and holds the PDFs from the HF dataset.

    The target directory is read from the ``DATA_DIR`` environment variable
    (default ``"data"``) and is created when missing. Every file in the
    hard-coded list below that is not already present locally is fetched from
    ``DATASET_REPO`` and stored under a flattened name in which ``/`` is
    replaced by ``_`` (``pyqs/cn_pyq_2019.pdf`` -> ``pyqs_cn_pyq_2019.pdf``).

    Returns:
        The local paths of all listed files, in list order.

    Raises:
        Any exception raised by ``hf_hub_download`` for a failed download, or
        ``OSError`` if copying into the data directory fails. Nothing is
        swallowed here, so one missing remote file aborts the whole call.
    """
    import shutil  # local import: only needed on the download path

    data_dir = os.getenv("DATA_DIR", "data")
    os.makedirs(data_dir, exist_ok=True)

    files = [
        "cn.pdf",
        "dos.pdf",
        "pyqs/cn_pyq_2019.pdf",
        "pyqs/cn_pyq_2020.pdf",
        "pyqs/cn_pyq_2022.pdf",
        "pyqs/cn_pyq_2023.pdf",
        "pyqs/cn_pyq_2024.pdf",
        # NOTE(review): 2028 looks like a typo for another year - confirm
        # against the actual file name in the dataset repo before changing.
        "pyqs/cn_pyq_2028.pdf",
        "pyqs/dos_pyq_2019.pdf",
        "pyqs/dos_pyq_2020.pdf",
        "pyqs/dos_pyq_2024.pdf",
        "pyqs/se_pyq_2018.pdf",
        "pyqs/se_pyq_2019.pdf",
        "pyqs/se_pyq_2020(S).pdf",
        "pyqs/se_pyq_2020.pdf",
        "pyqs/se_pyq_2022.pdf",
        "pyqs/se_pyq_2024.pdf",
    ]

    # hf_hub_download defaults to a *model* repo; the source here is described
    # as a dataset, so the repo type must be passed explicitly. The env var is
    # an escape hatch in case the repo is hosted under a different type.
    repo_type = os.getenv("HF_REPO_TYPE", "dataset")

    local_paths = []
    for f in files:
        local_path = os.path.join(data_dir, f.replace("/", "_"))
        if not os.path.exists(local_path):
            print(f"📥 Downloading {f} from Hugging Face...")
            downloaded = hf_hub_download(
                repo_id=DATASET_REPO,
                filename=f,
                repo_type=repo_type,
            )
            # The returned path lives inside the shared Hugging Face cache
            # (usually a relative symlink to a blob). Renaming it would pull
            # the entry out of the cache, leave a dangling link in data_dir,
            # and fail outright across filesystems - so copy the contents.
            partial_path = local_path + ".part"
            shutil.copyfile(downloaded, partial_path)
            # Atomic swap: an interrupted copy never leaves a truncated file
            # that a later run would mistake for a finished download.
            os.replace(partial_path, local_path)
        local_paths.append(local_path)
    return local_paths
|
| 108 |
+
|
| 109 |
|
| 110 |
def detect_subject(fname: str) -> Optional[str]:
|
| 111 |
# light heuristic to guess subject code from filename
|
|
|
|
| 191 |
def build_or_load_indexes(force_reindex: bool = False):
|
| 192 |
if os.getenv("FORCE_REINDEX", "0").lower() in ("1", "true", "yes"):
|
| 193 |
force_reindex = True
|
| 194 |
+
|
| 195 |
+
ensure_data_dir()
|
| 196 |
docs = load_all_docs(DATA_DIR)
|
| 197 |
if not docs:
|
| 198 |
logger.warning("No documents found. Returning empty indexes.")
|