07Codex07 committed on
Commit
9f40f90
·
1 Parent(s): 7295f44

Fix: auto-download PDFs from HF dataset before loading

Browse files
Files changed (1) hide show
  1. chatbot_retriever.py +42 -1
chatbot_retriever.py CHANGED
@@ -66,6 +66,46 @@ SEARCH_EXPANSION = int(os.getenv("FAISS_SEARCH_EXPANSION", 5))
66
  logger = logging.getLogger(__name__)
67
  logger.setLevel(logging.INFO)
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  def detect_subject(fname: str) -> Optional[str]:
71
  # light heuristic to guess subject code from filename
@@ -151,7 +191,8 @@ def load_all_docs(base_dir: str = DATA_DIR) -> List:
151
  def build_or_load_indexes(force_reindex: bool = False):
152
  if os.getenv("FORCE_REINDEX", "0").lower() in ("1", "true", "yes"):
153
  force_reindex = True
154
-
 
155
  docs = load_all_docs(DATA_DIR)
156
  if not docs:
157
  logger.warning("No documents found. Returning empty indexes.")
 
66
  logger = logging.getLogger(__name__)
67
  logger.setLevel(logging.INFO)
68
 
69
+ from huggingface_hub import hf_hub_download
70
+ import os
71
+
72
+ DATASET_REPO = "07Codex07/PrepGraph-Data"
73
+
74
def ensure_data_dir():
    """Ensure the local data directory exists and holds the dataset PDFs.

    The target directory is read from the ``DATA_DIR`` environment variable
    (default ``"data"``) and created if missing. Every file listed below is
    fetched from the Hugging Face dataset repo ``DATASET_REPO`` unless a copy
    is already present locally. Repo sub-paths are flattened, so
    ``pyqs/cn_pyq_2019.pdf`` is stored as ``<data_dir>/pyqs_cn_pyq_2019.pdf``.

    Returns:
        A list of local file paths, one per entry in the file list, in the
        same order.

    Raises:
        Whatever ``hf_hub_download`` raises when a file cannot be fetched,
        and ``OSError`` if the local copy cannot be written.
    """
    import shutil  # local import: only needed by this function

    data_dir = os.getenv("DATA_DIR", "data")
    os.makedirs(data_dir, exist_ok=True)

    files = [
        "cn.pdf",
        "dos.pdf",
        "pyqs/cn_pyq_2019.pdf",
        "pyqs/cn_pyq_2020.pdf",
        "pyqs/cn_pyq_2022.pdf",
        "pyqs/cn_pyq_2023.pdf",
        "pyqs/cn_pyq_2024.pdf",
        # NOTE(review): 2028 looks like a typo - confirm against the dataset repo.
        "pyqs/cn_pyq_2028.pdf",
        "pyqs/dos_pyq_2019.pdf",
        "pyqs/dos_pyq_2020.pdf",
        "pyqs/dos_pyq_2024.pdf",
        "pyqs/se_pyq_2018.pdf",
        "pyqs/se_pyq_2019.pdf",
        "pyqs/se_pyq_2020(S).pdf",
        "pyqs/se_pyq_2020.pdf",
        "pyqs/se_pyq_2022.pdf",
        "pyqs/se_pyq_2024.pdf",
    ]

    local_paths = []
    for f in files:
        local_path = os.path.join(data_dir, f.replace("/", "_"))
        if not os.path.exists(local_path):
            print(f"📥 Downloading {f} from Hugging Face...")
            # The repo is a dataset; without repo_type the hub client looks
            # for a model repo of the same name and the download fails.
            downloaded = hf_hub_download(
                repo_id=DATASET_REPO,
                filename=f,
                repo_type="dataset",
            )
            # Copy instead of renaming: the returned path lives inside the
            # hub cache (usually a symlink to a blob). Moving it would leave
            # a dangling link, corrupt the cache, and fail outright when the
            # cache and data_dir are on different filesystems.
            partial_path = local_path + ".part"
            shutil.copyfile(downloaded, partial_path)
            # Atomic swap so an interrupted copy is never mistaken for a
            # complete file by the os.path.exists() check on the next run.
            os.replace(partial_path, local_path)
        local_paths.append(local_path)
    return local_paths
108
+
109
 
110
  def detect_subject(fname: str) -> Optional[str]:
111
  # light heuristic to guess subject code from filename
 
191
  def build_or_load_indexes(force_reindex: bool = False):
192
  if os.getenv("FORCE_REINDEX", "0").lower() in ("1", "true", "yes"):
193
  force_reindex = True
194
+
195
+ ensure_data_dir()
196
  docs = load_all_docs(DATA_DIR)
197
  if not docs:
198
  logger.warning("No documents found. Returning empty indexes.")