07Codex07 committed on
Commit
aee4775
·
1 Parent(s): 708635f

Fix: add repo_type=dataset and force_download=True for public dataset

Browse files
Files changed (1) hide show
  1. chatbot_retriever.py +16 -9
chatbot_retriever.py CHANGED
@@ -72,7 +72,10 @@ import os
72
  DATASET_REPO = "07Codex07/PrepGraph-Data"
73
 
74
  def ensure_data_dir():
75
- """Ensure data/ folder exists and download PDFs from the Hugging Face dataset with correct subfolders."""
 
 
 
76
  data_dir = os.getenv("DATA_DIR", "data")
77
  os.makedirs(data_dir, exist_ok=True)
78
 
@@ -98,21 +101,25 @@ def ensure_data_dir():
98
 
99
  local_paths = []
100
  for f in files:
101
- # ✅ Keep the original folder structure (e.g., data/pyqs/...)
102
- download_path = os.path.join(data_dir, f)
103
- os.makedirs(os.path.dirname(download_path), exist_ok=True)
104
 
105
- if not os.path.exists(download_path):
106
  print(f"📥 Downloading {f} from Hugging Face (public dataset)...")
107
  downloaded = hf_hub_download(
108
  repo_id=DATASET_REPO,
109
  filename=f,
110
- repo_type="dataset", # ✅ specify dataset type
111
- force_download=True # ✅ force refresh cache if needed
112
  )
113
- os.replace(downloaded, download_path)
 
 
114
 
115
- local_paths.append(download_path)
 
 
 
116
 
117
  return local_paths
118
 
 
72
  DATASET_REPO = "07Codex07/PrepGraph-Data"
73
 
74
  def ensure_data_dir():
75
+ """Ensure data/ folder exists and contains the Hugging Face dataset PDFs properly structured."""
76
+ from huggingface_hub import hf_hub_download
77
+ import shutil
78
+
79
  data_dir = os.getenv("DATA_DIR", "data")
80
  os.makedirs(data_dir, exist_ok=True)
81
 
 
101
 
102
  local_paths = []
103
  for f in files:
104
+ dest_path = os.path.join(data_dir, f) # ✅ keep real folder structure
105
+ os.makedirs(os.path.dirname(dest_path), exist_ok=True)
 
106
 
107
+ if not os.path.exists(dest_path):
108
  print(f"📥 Downloading {f} from Hugging Face (public dataset)...")
109
  downloaded = hf_hub_download(
110
  repo_id=DATASET_REPO,
111
  filename=f,
112
+ repo_type="dataset",
113
+ force_download=True,
114
  )
115
+ shutil.copy(downloaded, dest_path) # ✅ copy instead of rename (works inside HF Spaces)
116
+
117
+ local_paths.append(dest_path)
118
 
119
+ # Debug info for verification
120
+ print(f"✅ Total files ensured: {len(local_paths)}")
121
+ for p in local_paths[:3]:
122
+ print(f" → {p}")
123
 
124
  return local_paths
125