essprasad committed (verified)
Commit 7829d29 · Parent(s): 394a257

Upload 10 files

Files changed (10):
  1. README.md +53 -0
  2. app.py +359 -0
  3. cleanup_space.py +135 -0
  4. gitattributes +49 -0
  5. gitignore +71 -0
  6. lfsconfig +4 -0
  7. postBuild +60 -0
  8. requirements.txt +43 -0
  9. runtime.txt +1 -0
  10. runtime.yaml +26 -0
README.md ADDED
@@ -0,0 +1,53 @@
---
title: Clinical Research Chatbot
emoji: 🧪
colorFrom: blue
colorTo: green
sdk: gradio
sdk_version: 5.49.0
app_file: app.py
pinned: false
---

# 🧪 Clinical Research Chatbot

A lightweight, fully open-source chatbot for clinical research professionals.
Runs entirely on Hugging Face — no OpenAI dependency.

---

## ✅ Current Features

### 💬 Chatbot Interface
- Gradio UI with chatbot + Admin Tools tab.
- Query pipeline: **FAQ → Glossary → Knowledge Base → APIs (PubMed → FDA → ClinicalTrials.gov)**.
- Answers are clearly labeled by source.

### 🔍 Knowledge Base (Docs + URLs)
- Supports ingestion of: PDF, DOCX, TXT, XLSX, JSON, HTML.
- Auto-ingests from:
  - `/data/public_docs/`
  - `/data/urls.txt`
- Smart chunking optimized for glossary terms + long text (a sketch follows below).
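
A rough sketch of what such chunking can look like (illustrative only; the app's actual chunker lives in its `core` modules and may behave differently):

```python
# Illustrative sketch, not the app's real chunker: keep short glossary-style
# entries whole, split long documents into overlapping character windows.
def chunk_text(text: str, max_chars: int = 1200, overlap: int = 150) -> list[str]:
    text = " ".join(text.split())           # collapse whitespace
    if len(text) <= max_chars:              # a glossary definition stays as one chunk
        return [text]
    chunks, start = [], 0
    while start < len(text):
        end = min(start + max_chars, len(text))
        chunks.append(text[start:end])
        if end == len(text):
            break
        start = end - overlap               # overlap preserves context across boundaries
    return chunks
```

The window and overlap sizes above are guesses; the important property is that short glossary entries are never split away from their definitions.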
33
+ ### 📦 Vector Search
34
+ - FAISS + `all-MiniLM-L6-v2` embeddings.
35
+ - Persistent storage:
36
+ - `/persistent/faiss.index`
37
+ - `/persistent/faiss.index.meta.json`
38
+ - Index survives restarts and can be exported/imported as `.zip`.
39
+
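
A minimal sketch of loading (or creating) such a persistent index, assuming `faiss-cpu` and `sentence-transformers` from `requirements.txt`; the app's real loading and rebuild logic lives in its `core` modules:

```python
# Minimal sketch under the assumptions above; paths follow this README, not necessarily the code.
import os
import json

import faiss
from sentence_transformers import SentenceTransformer

INDEX_PATH = "/persistent/faiss.index"
META_PATH = "/persistent/faiss.index.meta.json"

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")   # 384-dim embeddings

if os.path.exists(INDEX_PATH) and os.path.exists(META_PATH):
    index = faiss.read_index(INDEX_PATH)              # reuse the index across restarts
    with open(META_PATH, encoding="utf-8") as fh:
        metas = json.load(fh)                         # one metadata record per vector
else:
    index = faiss.IndexFlatIP(384)                    # inner product over normalized vectors = cosine
    metas = []

def search(query: str, k: int = 5):
    vec = model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(vec)
    scores, ids = index.search(vec, k)
    return [(metas[i], float(s)) for i, s in zip(ids[0], scores[0]) if i != -1]
```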

### 🌐 API Integrations
- PubMed
- FDA Drug Labels
- ClinicalTrials.gov (example calls for all three APIs follow below)
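
These examples use the public endpoints (NCBI E-utilities for PubMed, openFDA drug labels, ClinicalTrials.gov API v2) and are illustrative only; the app's actual query parameters and response handling may differ:

```python
# Illustrative lookups only; error handling and rate limiting are omitted.
import requests

def pubmed_search(term: str, retmax: int = 3) -> list[str]:
    r = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={"db": "pubmed", "term": term, "retmode": "json", "retmax": retmax},
        timeout=20,
    )
    return r.json()["esearchresult"]["idlist"]        # PubMed IDs (PMIDs)

def fda_label(drug: str) -> dict:
    r = requests.get(
        "https://api.fda.gov/drug/label.json",
        params={"search": f'openfda.generic_name:"{drug}"', "limit": 1},
        timeout=20,
    )
    return r.json()["results"][0]                     # first matching structured label

def trials(condition: str) -> list[dict]:
    r = requests.get(
        "https://clinicaltrials.gov/api/v2/studies",
        params={"query.cond": condition, "pageSize": 3},
        timeout=20,
    )
    return r.json()["studies"]                        # study records from the v2 API
```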

### 🧠 Query Handling
- Glossary-aware normalization
  *(e.g., eCRF, e-CRF, electronic case report form → same match)*
- Glossary priority: if a glossary hit exists, it is always returned first.
- Answer flow: **FAQ → Glossary → KB → APIs** (a routing sketch follows below).
- Clear section labels, citations, and confidence notes.
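
A toy sketch of the glossary-first routing described above (the helper data and names are illustrative, not the app's actual modules):

```python
# Toy routing sketch: FAQ → Glossary → KB → APIs, with glossary-aware normalization.
import re

FAQ = {}                                                      # illustrative stand-in data
GLOSSARY = {"ecrf": "Electronic Case Report Form: a digital form for collecting trial data."}
SYNONYMS = {"electronic case report form": "ecrf"}            # real synonyms come from glossary data

def normalize(q: str) -> str:
    t = re.sub(r"[\s\-]+", " ", q.strip().lower().rstrip("?"))
    return SYNONYMS.get(t, t.replace(" ", ""))                # "e-CRF", "eCRF" → "ecrf"

def answer(query: str) -> str:
    key = normalize(query)
    for label, lookup in (
        ("FAQ", FAQ.get),
        ("Glossary", GLOSSARY.get),          # a glossary hit is returned before KB/API results
        ("Knowledge Base", lambda k: None),  # stand-in for the FAISS search
        ("APIs", lambda k: None),            # stand-in for PubMed / FDA / ClinicalTrials.gov calls
    ):
        hit = lookup(key)
        if hit:
            return f"**Source: {label}**\n\n{hit}"
    return "No match found in FAQ, glossary, knowledge base, or external APIs."

print(answer("e-CRF"))   # → labeled Glossary answer
```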

### 📜 Logging
All queries, answers, and sources saved in:
app.py ADDED
@@ -0,0 +1,359 @@
# ==========================================================
# SAFE-MODE PRELAUNCH CLEANUP
# ==========================================================
import os
import shutil
import time
import glob

# Prevent Svelte/Gradio SSR locale warning early
os.environ["GRADIO_LOCALE"] = "en"


def _prelaunch_cleanup(threshold_gb: float = 45.0):
    """Pre-clean to avoid HF Spaces eviction while being conservative about persistent data."""
    def _used_gb(path="/home/user/app"):
        try:
            total, used, free = shutil.disk_usage(path)
            return round(min(used / (1024**3), 49.9), 2)
        except Exception:
            return 0.0

    used = _used_gb()
    print(f"\n💾 Startup disk usage: {used:.2f} GB")

    # Only perform aggressive cleanup when over threshold.
    if used > threshold_gb:
        print(f"⚠️ Usage {used:.2f} GB > {threshold_gb} GB — performing aggressive cleanup.")
        # Preserve persistent / important artifacts by default.
        preserve = {"faiss.index", "faiss.index.meta.json", "glossary.json"}
        for folder in ["/home/user/app/data/docs_cache", "/home/user/app/tmp_docs"]:
            if os.path.exists(folder):
                for f in glob.glob(os.path.join(folder, "*")):
                    name = os.path.basename(f)
                    if name in preserve:
                        continue
                    try:
                        if os.path.isdir(f):
                            shutil.rmtree(f, ignore_errors=True)
                        else:
                            os.remove(f)
                    except Exception:
                        pass
        print("🧹 Aggressive cleanup complete.")

    print(f"✨ Disk after cleanup: {_used_gb():.2f} GB\n")


_prelaunch_cleanup()


# ==========================================================
# MAIN APP — Clinical Trial Chatbot
# ==========================================================
import gradio as gr
from core.hybrid_retriever import summarize_combined

APP_TITLE = "🧠 Clinical Research Chatbot"
APP_DESC = (
    "Ask any clinical research or GCP-related question. "
    "Retrieves and summarizes from ICH, GCDMP, EMA, FDA, Excel, and Web datasets."
)


# ----------------------------------------------------------
# MODE & CREDENTIALS
# ----------------------------------------------------------
PUBLIC_MODE = os.environ.get("PUBLIC_MODE", "true").lower() == "true"
ADMIN_USER = os.environ.get("ADMIN_USER", "admin")
ADMIN_PASS = os.environ.get("ADMIN_PASS", "changeme")

print(f"🔐 Running in {'PUBLIC' if PUBLIC_MODE else 'ADMIN'} mode.")
print(f"🌍 Locale set to: {os.environ.get('GRADIO_LOCALE', 'en')}")
print(f"🧩 Env vars loaded: PUBLIC_MODE={PUBLIC_MODE}, ADMIN_USER={ADMIN_USER}")


# ----------------------------------------------------------
# AUTH HELPER
# ----------------------------------------------------------
def check_admin_login(username, password):
    return username == ADMIN_USER and password == ADMIN_PASS


# ----------------------------------------------------------
# MAINTENANCE FUNCTIONS
# ----------------------------------------------------------
import json
import faiss
import pandas as pd
import numpy as np
import shutil as _shutil  # alias to avoid shadowed name
from sentence_transformers import SentenceTransformer
from core.vector_sync import rebuild_faiss_from_glossary, _upload_to_dataset
from huggingface_hub import hf_hub_download, list_repo_files

DATA_PATHS = [
    "/home/user/app/persistent/faiss.index",
    "/home/user/app/persistent/faiss.index.meta.json",
    "/home/user/app/data/docs_cache",
]


def clear_index():
    removed = []
    for p in DATA_PATHS:
        if os.path.isdir(p):
            _shutil.rmtree(p, ignore_errors=True)
            removed.append(f"🗑️ Deleted folder: {p}")
        elif os.path.exists(p):
            os.remove(p)
            removed.append(f"🗑️ Deleted file: {p}")
    msg = "\n".join(removed) if removed else "ℹ️ No cache files found."
    print(msg)
    return msg


def rebuild_index():
    """Rebuild FAISS index from glossary + Excel + web."""
    try:
        import os
        import json
        import pandas as pd
        import faiss
        import numpy as np
        from sentence_transformers import SentenceTransformer

        from core.web_loader import web_crawler_loader  # may raise; handled below

        repo_id_index = "essprasad/CT-Chat-Index"
        repo_id_docs = "essprasad/CT-Chat-Docs"
        local_dir = "/home/user/app/persistent"
        os.makedirs(local_dir, exist_ok=True)

        print("🧠 Rebuilding FAISS index (Glossary + Excel + Web)…")

        # --- Ensure glossary.json exists (download if missing)
        glossary_path = os.path.join(local_dir, "glossary.json")
        if not os.path.exists(glossary_path):
            try:
                print("📥 glossary.json missing locally — downloading from HF index dataset...")
                downloaded = hf_hub_download(repo_id=repo_id_index, filename="persistent/glossary.json", repo_type="dataset")
                # Copy to the local persistent path.
                _shutil.copy2(downloaded, glossary_path)
                print("✅ Downloaded glossary.json.")
            except Exception as e:
                print(f"⚠️ Could not download glossary.json: {e}. Proceeding if available in other sources.")

        # Rebuild FAISS from glossary (this returns an index object and metadata list).
        index, metas = rebuild_faiss_from_glossary(glossary_path=glossary_path)
        print(f"📘 Loaded {len(metas)} glossary entries.")

        # --- Index Excel (MRCT Glossary)
        print("📑 Scanning Excel files in dataset…")
        repo_files = list_repo_files(repo_id_docs, repo_type="dataset")
        excel_files = [f for f in repo_files if f.lower().endswith((".xlsx", ".xls"))]

        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        excel_entries = []

        for file_name in excel_files:
            print(f"📄 Reading {file_name}…")
            try:
                path = hf_hub_download(repo_id_docs, filename=file_name, repo_type="dataset")
                xls = pd.read_excel(path, sheet_name=None)
                for sheet, df in xls.items():
                    if "Glossary Term" not in df.columns:
                        continue
                    df = df.fillna("").dropna(how="all")
                    for _, row in df.iterrows():
                        term = str(row.get("Glossary Term", "")).strip()
                        if not term:
                            continue

                        # Combine all the relevant MRCT fields.
                        combined_text = (
                            f"Glossary Term: {term}\n"
                            f"Glossary Definition: {row.get('Glossary Definition', '')}\n"
                            f"Use in Context: {row.get('Use in Context', '')}\n"
                            f"More Info: {row.get('More Info', '')}\n"
                            f"Other Info to Think About When Joining a Study: {row.get('Other Info to Think About When Joining a Study', '')}\n"
                            f"Related Terms: {row.get('Related Terms', '')}\n"
                            f"Other Resources: {row.get('Other Resources', '')}\n"
                            f"Term URL: {row.get('Term URL', '')}\n"
                            f"CDISC/NCI URL: {row.get('CDISC/NCI URL', '')}\n"
                            f"Version: {row.get('Version', '')}"
                        ).strip()

                        excel_entries.append({
                            "source": file_name,
                            "sheet": sheet,
                            "term": term,
                            "type": "Excel",
                            "file": file_name,
                            "text": combined_text,
                        })
            except Exception as e:
                print(f"⚠️ Error reading {file_name}: {e}")

        if excel_entries:
            texts = [e["text"] for e in excel_entries]
            embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
            faiss.normalize_L2(embeddings)
            index.add(embeddings)
            metas.extend(excel_entries)
            print(f"✅ Added {len(excel_entries)} Excel entries to FAISS.")

        # --- Optional: load web content (may be slow).
        try:
            print("🌐 Loading and embedding web sources…")
            web_entries = web_crawler_loader(
                urls_file="/home/user/app/data/urls.txt",
                cache_path="/home/user/app/persistent/web_cache.json",
                max_pages=3,
                timeout=20,
                force_refresh=False,
            )
            if web_entries:
                web_entries = [e for e in web_entries if len(e.get("text", "")) > 200]
                print(f"✅ Retrieved {len(web_entries)} web entries.")
                web_texts = [e["text"] for e in web_entries]
                web_emb = model.encode(web_texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
                faiss.normalize_L2(web_emb)
                index.add(web_emb)
                metas.extend(web_entries)
                print("✅ Web content added to FAISS.")
        except Exception as e:
            print(f"⚠️ Web content embedding failed: {e}")

        # --- Save index + metadata locally.
        faiss_path = os.path.join(local_dir, "faiss.index")
        meta_path = os.path.join(local_dir, "faiss.index.meta.json")
        faiss.write_index(index, faiss_path)
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(metas, f, indent=2)
        print(f"💾 Local FAISS saved ({len(metas)} entries).")

        # --- Upload artifacts back to the HF dataset (best effort).
        try:
            _upload_to_dataset(faiss_path, meta_path, repo_id_index)
            print(f"☁️ Uploaded FAISS ({len(metas)} entries) to {repo_id_index}.")
        except Exception as e:
            print(f"⚠️ Upload failed: {e}")

        return f"✅ Rebuild complete: {len(metas)} entries (Glossary + Excel + Web)."
    except Exception as e:
        return f"⚠️ Rebuild failed: {e}"


def rebuild_glossary():
    try:
        from core.glossary_builder import rebuild_and_upload
        rebuild_and_upload()
        return "✅ Glossary rebuilt and uploaded successfully."
    except Exception as e:
        return f"⚠️ Glossary rebuild failed: {e}"


def reset_faiss_cache():
    """
    Completely clears local FAISS and glossary caches, reloads the vector_store module
    (to wipe in-memory runtime caches), then rebuilds glossary + index.
    """
    try:
        # Use the clear helper from core.vector_store if available.
        from importlib import reload
        from core import vector_store

        # If vector_store exposes clear_local_faiss, use it (safe and logged).
        if hasattr(vector_store, "clear_local_faiss"):
            vector_store.clear_local_faiss()
        else:
            # Fallback: manually delete persistent/runtime files.
            paths = [
                "/home/user/app/persistent/faiss.index",
                "/home/user/app/persistent/faiss.index.meta.json",
                "/home/user/app/persistent/glossary.json",
                "/home/user/app/runtime_faiss",
            ]
            for p in paths:
                if os.path.exists(p):
                    try:
                        if os.path.isdir(p):
                            _shutil.rmtree(p, ignore_errors=True)
                        else:
                            os.remove(p)
                        print(f"🗑️ Deleted: {p}")
                    except Exception:
                        pass

        # Reload the module to clear any in-memory caches.
        reload(vector_store)
        print("♻️ FAISS runtime module reloaded to ensure fresh index rebuild.")

        msg = "🧹 Local FAISS + glossary cache cleared. Starting full rebuild...\n\n"
        msg += rebuild_glossary() + "\n"
        msg += rebuild_index()
        return msg
    except Exception as e:
        return f"⚠️ Reset failed: {e}"


# ----------------------------------------------------------
# CHATBOT CORE
# ----------------------------------------------------------
def chat_answer(query, mode="short"):
    try:
        if not query or not str(query).strip():
            return "<i>⚠️ Please enter a valid query.</i>"
        return summarize_combined(str(query).strip(), mode=mode)
    except Exception as e:
        print("❌ Chatbot error:", e)
        return f"<i>⚠️ Error: {e}</i>"


# ----------------------------------------------------------
# GRADIO UI
# ----------------------------------------------------------
with gr.Blocks(theme="gradio/soft") as demo:
    gr.Markdown(f"# {APP_TITLE}")
    gr.Markdown(APP_DESC)

    query_box = gr.Textbox(
        label="Ask your clinical trial question",
        placeholder="e.g. What is an eCRF?",
        lines=2,
    )
    output_box = gr.HTML(label="Answer")

    with gr.Row():
        submit_btn = gr.Button("🚀 Submit", variant="primary")
        if not PUBLIC_MODE:
            rebuild_btn = gr.Button("🔁 Rebuild Index")
            rebuild_glossary_btn = gr.Button("📘 Rebuild Glossary")
            reset_btn = gr.Button("🧹 Reset FAISS Cache (Full Rebuild)")
            clear_btn = gr.Button("🗑️ Clear Index Only")

    submit_btn.click(fn=chat_answer, inputs=[query_box], outputs=output_box)
    query_box.submit(fn=chat_answer, inputs=[query_box], outputs=output_box)

    if not PUBLIC_MODE:
        rebuild_btn.click(fn=rebuild_index, outputs=output_box)
        rebuild_glossary_btn.click(fn=rebuild_glossary, outputs=output_box)
        reset_btn.click(fn=reset_faiss_cache, outputs=output_box)
        clear_btn.click(fn=clear_index, outputs=output_box)


# ----------------------------------------------------------
# LAUNCH APP
# ----------------------------------------------------------
if __name__ == "__main__":
    print("🚀 Starting Clinical Trial Chatbot…")
    print("🧠 Initializing retriever warm-up…")
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        auth=check_admin_login if not PUBLIC_MODE else None,
        ssr_mode=False,
    )
cleanup_space.py ADDED
@@ -0,0 +1,135 @@
"""
cleanup_space.py
----------------
Maintenance script for Hugging Face Space cleanup.
- Removes caches, temp files, and large unneeded assets.
- Keeps only FAISS index + metadata + glossary.
- Reuploads them to the CT-Chat-Index dataset.
"""
import os
import shutil
import time
from datetime import datetime
from huggingface_hub import HfApi, upload_file, HfFolder

# 🔧 Configuration
REPO_ID = "essprasad/CT-Chat-Index"  # Dataset repo
REPO_TYPE = "dataset"
PERSISTENT_DIR = "persistent"
DATA_DIR = "data"
KEEP_FILES = [
    "persistent/faiss.index",
    "persistent/faiss.index.meta.json",
    "data/glossary.json",
]

api = HfApi()
token = HfFolder.get_token() or os.getenv("HF_TOKEN", None)


def readable_size(path):
    """Return human-readable folder size."""
    total = 0
    for dirpath, _, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.exists(fp):
                total += os.path.getsize(fp)
    for unit in ["B", "KB", "MB", "GB"]:
        if total < 1024.0:
            return f"{total:.2f} {unit}"
        total /= 1024.0
    return f"{total:.2f} TB"


# --------------------------------------------------------------------
# 1. Clean caches, logs, temp files
# --------------------------------------------------------------------
def clean_temp_and_cache():
    print("🧹 Cleaning temporary and cache directories...")
    for path in ["/root/.cache", "/home/user/.cache", "/tmp"]:
        shutil.rmtree(path, ignore_errors=True)
        os.makedirs(path, exist_ok=True)

    # Remove logs larger than 5 MB
    log_dir = "logs"
    if os.path.exists(log_dir):
        for f in os.listdir(log_dir):
            fp = os.path.join(log_dir, f)
            if os.path.isfile(fp) and os.path.getsize(fp) > 5 * 1024 * 1024:
                os.remove(fp)
                print(f"🗑️ Removed oversized log: {fp}")


# --------------------------------------------------------------------
# 2. Remove large documents & orphan files
# --------------------------------------------------------------------
def trim_data():
    print("📦 Trimming large files from data/public_docs...")
    doc_dir = os.path.join(DATA_DIR, "public_docs")
    if not os.path.exists(doc_dir):
        return

    for root, _, files in os.walk(doc_dir):
        for f in files:
            fp = os.path.join(root, f)
            if os.path.getsize(fp) > 10 * 1024 * 1024:  # >10 MB
                print(f"🗑️ Removing large doc: {fp}")
                os.remove(fp)


# --------------------------------------------------------------------
# 3. Verify and keep only essential files
# --------------------------------------------------------------------
def preserve_key_files():
    print("🔒 Preserving essential files (index + glossary)...")
    all_keep = []
    for f in KEEP_FILES:
        if os.path.exists(f):
            print(f"✅ Keeping: {f}")
            all_keep.append(f)
        else:
            print(f"⚠️ Missing expected file: {f}")
    return all_keep


# --------------------------------------------------------------------
# 4. Upload cleaned files to dataset
# --------------------------------------------------------------------
def upload_to_hub(files):
    if not token:
        print("❌ No HF token found. Please add HF_TOKEN with write access.")
        return
    print(f"🚀 Uploading cleaned files to {REPO_ID} ...")
    for f in files:
        try:
            upload_file(
                path_or_fileobj=f,
                path_in_repo=f,
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                token=token,
                commit_message=f"Auto-cleanup sync {datetime.utcnow().isoformat()}",
            )
            print(f"✅ Uploaded: {f}")
        except Exception as e:
            print(f"⚠️ Failed to upload {f}: {e}")


# --------------------------------------------------------------------
# 5. Disk usage report
# --------------------------------------------------------------------
def report_usage():
    print("\n📊 Disk Usage Summary:")
    for path in ["persistent", "data", "/home/user"]:
        if os.path.exists(path):
            print(f"{path}: {readable_size(path)}")


# --------------------------------------------------------------------
# Run everything
# --------------------------------------------------------------------
if __name__ == "__main__":
    start = time.time()
    print("===== 🧹 Starting Space Cleanup =====")
    clean_temp_and_cache()
    trim_data()
    files = preserve_key_files()
    upload_to_hub(files)
    report_usage()
    print(f"\n✅ Cleanup finished in {time.time() - start:.2f}s")
gitattributes ADDED
@@ -0,0 +1,49 @@
# ================================================
# ⚙️ Clinical Research Chatbot – Simplified .gitattributes
# ================================================
# Version: Safe for Hugging Face UI-only management
# (no Git LFS required)
# --------------------------------

# --------------------------------
# Code & Config Files (text mode)
# --------------------------------
*.py text eol=lf
*.txt text eol=lf
*.md text eol=lf
*.json text eol=lf
*.csv text eol=lf
*.yaml text eol=lf
*.yml text eol=lf
*.html text eol=lf
*.css text eol=lf
*.js text eol=lf
*.ini text eol=lf
*.cfg text eol=lf
*.toml text eol=lf
requirements.txt text eol=lf
runtime.txt text eol=lf
runtime.yaml text eol=lf
*.gitignore text eol=lf
*.gitattributes text eol=lf

# --------------------------------
# Binary & Data Files (no LFS)
# --------------------------------
*.pdf binary
*.docx binary
*.xlsx binary
*.zip binary
*.ppt binary
*.odt binary
*.png binary
*.jpg binary
*.jpeg binary
*.tif binary
*.tiff binary
*.gif binary

# --------------------------------
# Default handling
# --------------------------------
* text=auto eol=lf
gitignore ADDED
@@ -0,0 +1,71 @@
# =========================================
# 🧪 Clinical Research Chatbot – .gitignore
# =========================================

# -------------------------
# Python
# -------------------------
__pycache__/
*.pyc
*.pyo
*.pyd
*.pkl
*.pickle

# -------------------------
# Environment / virtualenv
# -------------------------
.venv/
env/
venv/
ENV/
*.env

# -------------------------
# Data & Logs
# -------------------------
logs/*
!logs/.gitkeep
# Keep recent chatbot logs
!logs/query_log.csv

# -------------------------
# Data Folders
# -------------------------
# Keep reference docs & FAQs, ignore temporary files
data/public_docs/*
!data/public_docs/.gitkeep

data/faq/*
!data/faq/.gitkeep

# Glossary and metadata files should stay (important for chatbot)
!data/glossary.json
!data/faq_data.json
!data/clinical_faq.json

# Ignore temporary FAISS or index rebuilds
persistent/*
!persistent/.gitkeep
!persistent/faiss.index
!persistent/faiss.index.meta.json

# -------------------------
# Hugging Face + Transformers cache
# -------------------------
.cache/
datasets/
transformers_cache/
.huggingface/

# -------------------------
# IDE / Editor
# -------------------------
.vscode/
.idea/
.DS_Store

# -------------------------
# Miscellaneous
# -------------------------
*.tmp
*.bak
lfsconfig ADDED
@@ -0,0 +1,4 @@
[lfs]
url = https://huggingface.co/
locksverify = true
batch = true
postBuild ADDED
@@ -0,0 +1,60 @@
#!/bin/bash
set -e

echo "🔧 PostBuild starting — optimizing CT-Chat Space..."

# -------------------------------------------------------
# 1️⃣ Fix dependency mismatches (Gradio & Websockets)
# -------------------------------------------------------
pip install --force-reinstall --no-cache-dir "websockets>=12" "gradio-client>=1.3.0"

# -------------------------------------------------------
# 2️⃣ Create and register shared NLTK data directory
# -------------------------------------------------------
echo "📁 Preparing shared NLTK data directory..."
export NLTK_DATA="/usr/local/share/nltk_data"
mkdir -p "$NLTK_DATA"
chmod -R 777 "$NLTK_DATA"

# -------------------------------------------------------
# 3️⃣ Preload all required NLTK resources (including punkt_tab)
# -------------------------------------------------------
echo "📦 Downloading NLTK resources..."
python -m nltk.downloader -d "$NLTK_DATA" \
  punkt punkt_tab averaged_perceptron_tagger averaged_perceptron_tagger_eng stopwords wordnet omw-1.4

# -------------------------------------------------------
# 4️⃣ Verify NLTK installs and paths
# -------------------------------------------------------
python - <<'PYCODE'
import nltk

print(f"NLTK data path → {nltk.data.path}")
# nltk.data.find() expects category-prefixed paths (tokenizers/, taggers/, corpora/).
for pkg in [
    "tokenizers/punkt",
    "tokenizers/punkt_tab",
    "taggers/averaged_perceptron_tagger_eng",
    "corpora/stopwords",
    "corpora/wordnet",
]:
    try:
        nltk.data.find(pkg)
        print(f"✅ Verified NLTK resource: {pkg}")
    except LookupError:
        print(f"⚠️ Missing NLTK resource: {pkg}")
PYCODE

# -------------------------------------------------------
# 5️⃣ Clean caches (stay <50GB)
# -------------------------------------------------------
echo "🧹 Cleaning Hugging Face + Torch caches..."
rm -rf /root/.cache/* || true
rm -rf /home/user/.cache/* || true
rm -rf /usr/local/share/nltk_data/taggers/__pycache__ || true
rm -rf /home/user/app/hf_cache/* || true
rm -rf /home/user/app/logs/* || true

# -------------------------------------------------------
# 6️⃣ Ensure writable temporary cache for runtime
# -------------------------------------------------------
echo "📦 Preparing /tmp/hf_cache..."
mkdir -p /tmp/hf_cache
chmod -R 777 /tmp/hf_cache

# -------------------------------------------------------
# ✅ Done
# -------------------------------------------------------
echo "✅ PostBuild completed successfully — NLTK preloaded (punkt_tab OK), cache ready at /tmp/hf_cache."
requirements.txt ADDED
@@ -0,0 +1,43 @@
# =======================================
# 🧪 Clinical Research Chatbot Requirements
# =======================================

# --- Core Libraries ---
faiss-cpu
torch
transformers
sentence-transformers
sentencepiece
fastapi
whoosh

# --- Data Handling ---
numpy
pandas
datasets

# --- Document Parsing ---
pymupdf
python-docx
openpyxl
beautifulsoup4
requests
aiofiles
rank-bm25

# --- NLP + Text Processing ---
nltk
scikit-learn
regex
tqdm

# --- Web + Interface ---
huggingface-hub>=0.23.0
gradio
gradio-client
uvicorn
spaces
python-multipart

# --- Networking / Compatibility Fix ---
websockets>=12
runtime.txt ADDED
@@ -0,0 +1 @@
python-3.10
runtime.yaml ADDED
@@ -0,0 +1,26 @@
# =======================================
# ⚙️ Hugging Face Space Runtime Configuration
# =======================================

python: "3.10"            # Stable for FAISS + Gradio + Transformers

# App entrypoint (FastAPI with Gradio mount)
entrypoint: "app:app"

hardware: "cpu-basic"     # For small to medium FAISS indexes
# hardware: "cpu-upgrade" # Uncomment for larger index (>100 MB) or slower summaries

timeout: 600              # 10-minute build timeout
autoreload: true          # Auto-reload app on file updates (optional)

# Cache persistent resources (prevents redownload)
cache:
  - data/
  - persistent/
  - logs/

# Explicit build hook (optional, for clarity)
build:
  commands:
    - bash postBuild