essprasad committed (verified)
Commit 7829d29 · Parent(s): 394a257

Upload 10 files

Files changed (10):
  1. README.md +53 -0
  2. app.py +359 -0
  3. cleanup_space.py +135 -0
  4. gitattributes +49 -0
  5. gitignore +71 -0
  6. lfsconfig +4 -0
  7. postBuild +60 -0
  8. requirements.txt +43 -0
  9. runtime.txt +1 -0
  10. runtime.yaml +26 -0
README.md ADDED
@@ -0,0 +1,53 @@
---
title: Clinical Research Chatbot
emoji: 🧪
colorFrom: blue
colorTo: green
sdk: gradio
sdk_version: 5.49.0
app_file: app.py
pinned: false
---

# 🧪 Clinical Research Chatbot

A lightweight, fully open-source chatbot for clinical research professionals.
Runs entirely on Hugging Face — no OpenAI dependency.

---

## ✅ Current Features

### 💬 Chatbot Interface
- Gradio UI with chatbot + Admin Tools tab.
- Query pipeline: **FAQ → Glossary → Knowledge Base → APIs (PubMed → FDA → ClinicalTrials.gov)**.
- Answers are clearly labeled by source.

### 🔍 Knowledge Base (Docs + URLs)
- Supports ingestion of: PDF, DOCX, TXT, XLSX, JSON, HTML.
- Auto-ingests from:
  - `/data/public_docs/`
  - `/data/urls.txt`
- Smart chunking optimized for glossary terms + long text (a sketch follows below).
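
A rough sketch of what such chunking can look like (illustrative only; the app's actual chunker lives in its `core` modules and may behave differently):

```python
# Illustrative sketch, not the app's real chunker: keep short glossary-style
# entries whole, split long documents into overlapping character windows.
def chunk_text(text: str, max_chars: int = 1200, overlap: int = 150) -> list[str]:
    text = " ".join(text.split())           # collapse whitespace
    if len(text) <= max_chars:              # a glossary definition stays as one chunk
        return [text]
    chunks, start = [], 0
    while start < len(text):
        end = min(start + max_chars, len(text))
        chunks.append(text[start:end])
        if end == len(text):
            break
        start = end - overlap               # overlap preserves context across boundaries
    return chunks
```

The window and overlap sizes above are guesses; the important property is that short glossary entries are never split away from their definitions.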
33
+ ### 📦 Vector Search
34
+ - FAISS + `all-MiniLM-L6-v2` embeddings.
35
+ - Persistent storage:
36
+ - `/persistent/faiss.index`
37
+ - `/persistent/faiss.index.meta.json`
38
+ - Index survives restarts and can be exported/imported as `.zip`.
39
+
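
A minimal sketch of loading (or creating) such a persistent index, assuming `faiss-cpu` and `sentence-transformers` from `requirements.txt`; the app's real loading and rebuild logic lives in its `core` modules:

```python
# Minimal sketch under the assumptions above; paths follow this README, not necessarily the code.
import os
import json

import faiss
from sentence_transformers import SentenceTransformer

INDEX_PATH = "/persistent/faiss.index"
META_PATH = "/persistent/faiss.index.meta.json"

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")   # 384-dim embeddings

if os.path.exists(INDEX_PATH) and os.path.exists(META_PATH):
    index = faiss.read_index(INDEX_PATH)              # reuse the index across restarts
    with open(META_PATH, encoding="utf-8") as fh:
        metas = json.load(fh)                         # one metadata record per vector
else:
    index = faiss.IndexFlatIP(384)                    # inner product over normalized vectors = cosine
    metas = []

def search(query: str, k: int = 5):
    vec = model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(vec)
    scores, ids = index.search(vec, k)
    return [(metas[i], float(s)) for i, s in zip(ids[0], scores[0]) if i != -1]
```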

### 🌐 API Integrations
- PubMed
- FDA Drug Labels
- ClinicalTrials.gov (example calls for all three APIs follow below)
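
These examples use the public endpoints (NCBI E-utilities for PubMed, openFDA drug labels, ClinicalTrials.gov API v2) and are illustrative only; the app's actual query parameters and response handling may differ:

```python
# Illustrative lookups only; error handling and rate limiting are omitted.
import requests

def pubmed_search(term: str, retmax: int = 3) -> list[str]:
    r = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={"db": "pubmed", "term": term, "retmode": "json", "retmax": retmax},
        timeout=20,
    )
    return r.json()["esearchresult"]["idlist"]        # PubMed IDs (PMIDs)

def fda_label(drug: str) -> dict:
    r = requests.get(
        "https://api.fda.gov/drug/label.json",
        params={"search": f'openfda.generic_name:"{drug}"', "limit": 1},
        timeout=20,
    )
    return r.json()["results"][0]                     # first matching structured label

def trials(condition: str) -> list[dict]:
    r = requests.get(
        "https://clinicaltrials.gov/api/v2/studies",
        params={"query.cond": condition, "pageSize": 3},
        timeout=20,
    )
    return r.json()["studies"]                        # study records from the v2 API
```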

### 🧠 Query Handling
- Glossary-aware normalization
  *(e.g., eCRF, e-CRF, electronic case report form → same match)*
- Glossary priority: if a glossary hit exists, it is always returned first.
- Answer flow: **FAQ → Glossary → KB → APIs** (a routing sketch follows below).
- Clear section labels, citations, and confidence notes.
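
A toy sketch of the glossary-first routing described above (the helper data and names are illustrative, not the app's actual modules):

```python
# Toy routing sketch: FAQ → Glossary → KB → APIs, with glossary-aware normalization.
import re

FAQ = {}                                                      # illustrative stand-in data
GLOSSARY = {"ecrf": "Electronic Case Report Form: a digital form for collecting trial data."}
SYNONYMS = {"electronic case report form": "ecrf"}            # real synonyms come from glossary data

def normalize(q: str) -> str:
    t = re.sub(r"[\s\-]+", " ", q.strip().lower().rstrip("?"))
    return SYNONYMS.get(t, t.replace(" ", ""))                # "e-CRF", "eCRF" → "ecrf"

def answer(query: str) -> str:
    key = normalize(query)
    for label, lookup in (
        ("FAQ", FAQ.get),
        ("Glossary", GLOSSARY.get),          # a glossary hit is returned before KB/API results
        ("Knowledge Base", lambda k: None),  # stand-in for the FAISS search
        ("APIs", lambda k: None),            # stand-in for PubMed / FDA / ClinicalTrials.gov calls
    ):
        hit = lookup(key)
        if hit:
            return f"**Source: {label}**\n\n{hit}"
    return "No match found in FAQ, glossary, knowledge base, or external APIs."

print(answer("e-CRF"))   # → labeled Glossary answer
```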

### 📜 Logging
All queries, answers, and sources saved in:
app.py ADDED
@@ -0,0 +1,359 @@
# ==========================================================
# SAFE-MODE PRELAUNCH CLEANUP
# ==========================================================
import os
import shutil
import time
import glob

# Prevent Svelte/Gradio SSR locale warning early
os.environ["GRADIO_LOCALE"] = "en"


def _prelaunch_cleanup(threshold_gb: float = 45.0):
    """Pre-clean to avoid HF Spaces eviction while being conservative about persistent data."""
    def _used_gb(path="/home/user/app"):
        try:
            total, used, free = shutil.disk_usage(path)
            return round(min(used / (1024**3), 49.9), 2)
        except Exception:
            return 0.0

    used = _used_gb()
    print(f"\n💾 Startup disk usage: {used:.2f} GB")

    # Only perform aggressive cleanup when over threshold.
    if used > threshold_gb:
        print(f"⚠️ Usage {used:.2f} GB > {threshold_gb} GB — performing aggressive cleanup.")
        # Preserve persistent / important artifacts by default.
        preserve = {"faiss.index", "faiss.index.meta.json", "glossary.json"}
        for folder in ["/home/user/app/data/docs_cache", "/home/user/app/tmp_docs"]:
            if os.path.exists(folder):
                for f in glob.glob(os.path.join(folder, "*")):
                    name = os.path.basename(f)
                    if name in preserve:
                        continue
                    try:
                        if os.path.isdir(f):
                            shutil.rmtree(f, ignore_errors=True)
                        else:
                            os.remove(f)
                    except Exception:
                        pass
        print("🧹 Aggressive cleanup complete.")

    print(f"✨ Disk after cleanup: {_used_gb():.2f} GB\n")


_prelaunch_cleanup()


# ==========================================================
# MAIN APP — Clinical Trial Chatbot
# ==========================================================
import gradio as gr
from core.hybrid_retriever import summarize_combined

APP_TITLE = "🧠 Clinical Research Chatbot"
APP_DESC = (
    "Ask any clinical research or GCP-related question. "
    "Retrieves and summarizes from ICH, GCDMP, EMA, FDA, Excel, and Web datasets."
)


# ----------------------------------------------------------
# MODE & CREDENTIALS
# ----------------------------------------------------------
PUBLIC_MODE = os.environ.get("PUBLIC_MODE", "true").lower() == "true"
ADMIN_USER = os.environ.get("ADMIN_USER", "admin")
ADMIN_PASS = os.environ.get("ADMIN_PASS", "changeme")

print(f"🔐 Running in {'PUBLIC' if PUBLIC_MODE else 'ADMIN'} mode.")
print(f"🌍 Locale set to: {os.environ.get('GRADIO_LOCALE', 'en')}")
print(f"🧩 Env vars loaded: PUBLIC_MODE={PUBLIC_MODE}, ADMIN_USER={ADMIN_USER}")


# ----------------------------------------------------------
# AUTH HELPER
# ----------------------------------------------------------
def check_admin_login(username, password):
    return username == ADMIN_USER and password == ADMIN_PASS


# ----------------------------------------------------------
# MAINTENANCE FUNCTIONS
# ----------------------------------------------------------
import json
import faiss
import pandas as pd
import numpy as np
import shutil as _shutil  # alias to avoid shadowed name
from sentence_transformers import SentenceTransformer
from core.vector_sync import rebuild_faiss_from_glossary, _upload_to_dataset
from huggingface_hub import hf_hub_download, list_repo_files

DATA_PATHS = [
    "/home/user/app/persistent/faiss.index",
    "/home/user/app/persistent/faiss.index.meta.json",
    "/home/user/app/data/docs_cache",
]


def clear_index():
    removed = []
    for p in DATA_PATHS:
        if os.path.isdir(p):
            _shutil.rmtree(p, ignore_errors=True)
            removed.append(f"🗑️ Deleted folder: {p}")
        elif os.path.exists(p):
            os.remove(p)
            removed.append(f"🗑️ Deleted file: {p}")
    msg = "\n".join(removed) if removed else "ℹ️ No cache files found."
    print(msg)
    return msg


def rebuild_index():
    """Rebuild FAISS index from glossary + Excel + web."""
    try:
        import os
        import json
        import pandas as pd
        import faiss
        import numpy as np
        from sentence_transformers import SentenceTransformer

        from core.web_loader import web_crawler_loader  # may raise; handled below

        repo_id_index = "essprasad/CT-Chat-Index"
        repo_id_docs = "essprasad/CT-Chat-Docs"
        local_dir = "/home/user/app/persistent"
        os.makedirs(local_dir, exist_ok=True)

        print("🧠 Rebuilding FAISS index (Glossary + Excel + Web)…")

        # --- Ensure glossary.json exists (download if missing)
        glossary_path = os.path.join(local_dir, "glossary.json")
        if not os.path.exists(glossary_path):
            try:
                print("📥 glossary.json missing locally — downloading from HF index dataset...")
                downloaded = hf_hub_download(repo_id=repo_id_index, filename="persistent/glossary.json", repo_type="dataset")
                # Copy to the local persistent path.
                _shutil.copy2(downloaded, glossary_path)
                print("✅ Downloaded glossary.json.")
            except Exception as e:
                print(f"⚠️ Could not download glossary.json: {e}. Proceeding if available in other sources.")

        # Rebuild FAISS from glossary (this returns an index object and metadata list).
        index, metas = rebuild_faiss_from_glossary(glossary_path=glossary_path)
        print(f"📘 Loaded {len(metas)} glossary entries.")

        # --- Index Excel (MRCT Glossary)
        print("📑 Scanning Excel files in dataset…")
        repo_files = list_repo_files(repo_id_docs, repo_type="dataset")
        excel_files = [f for f in repo_files if f.lower().endswith((".xlsx", ".xls"))]

        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        excel_entries = []

        for file_name in excel_files:
            print(f"📄 Reading {file_name}…")
            try:
                path = hf_hub_download(repo_id_docs, filename=file_name, repo_type="dataset")
                xls = pd.read_excel(path, sheet_name=None)
                for sheet, df in xls.items():
                    if "Glossary Term" not in df.columns:
                        continue
                    df = df.fillna("").dropna(how="all")
                    for _, row in df.iterrows():
                        term = str(row.get("Glossary Term", "")).strip()
                        if not term:
                            continue

                        # Combine all the relevant MRCT fields.
                        combined_text = (
                            f"Glossary Term: {term}\n"
                            f"Glossary Definition: {row.get('Glossary Definition', '')}\n"
                            f"Use in Context: {row.get('Use in Context', '')}\n"
                            f"More Info: {row.get('More Info', '')}\n"
                            f"Other Info to Think About When Joining a Study: {row.get('Other Info to Think About When Joining a Study', '')}\n"
                            f"Related Terms: {row.get('Related Terms', '')}\n"
                            f"Other Resources: {row.get('Other Resources', '')}\n"
                            f"Term URL: {row.get('Term URL', '')}\n"
                            f"CDISC/NCI URL: {row.get('CDISC/NCI URL', '')}\n"
                            f"Version: {row.get('Version', '')}"
                        ).strip()

                        excel_entries.append({
                            "source": file_name,
                            "sheet": sheet,
                            "term": term,
                            "type": "Excel",
                            "file": file_name,
                            "text": combined_text,
                        })
            except Exception as e:
                print(f"⚠️ Error reading {file_name}: {e}")

        if excel_entries:
            texts = [e["text"] for e in excel_entries]
            embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
            faiss.normalize_L2(embeddings)
            index.add(embeddings)
            metas.extend(excel_entries)
            print(f"✅ Added {len(excel_entries)} Excel entries to FAISS.")

        # --- Optional: load web content (may be slow).
        try:
            print("🌐 Loading and embedding web sources…")
            web_entries = web_crawler_loader(
                urls_file="/home/user/app/data/urls.txt",
                cache_path="/home/user/app/persistent/web_cache.json",
                max_pages=3,
                timeout=20,
                force_refresh=False,
            )
            if web_entries:
                web_entries = [e for e in web_entries if len(e.get("text", "")) > 200]
                print(f"✅ Retrieved {len(web_entries)} web entries.")
                web_texts = [e["text"] for e in web_entries]
                web_emb = model.encode(web_texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
                faiss.normalize_L2(web_emb)
                index.add(web_emb)
                metas.extend(web_entries)
                print("✅ Web content added to FAISS.")
        except Exception as e:
            print(f"⚠️ Web content embedding failed: {e}")

        # --- Save index + metadata locally.
        faiss_path = os.path.join(local_dir, "faiss.index")
        meta_path = os.path.join(local_dir, "faiss.index.meta.json")
        faiss.write_index(index, faiss_path)
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(metas, f, indent=2)
        print(f"💾 Local FAISS saved ({len(metas)} entries).")

        # --- Upload artifacts back to the HF dataset (best effort).
        try:
            _upload_to_dataset(faiss_path, meta_path, repo_id_index)
            print(f"☁️ Uploaded FAISS ({len(metas)} entries) to {repo_id_index}.")
        except Exception as e:
            print(f"⚠️ Upload failed: {e}")

        return f"✅ Rebuild complete: {len(metas)} entries (Glossary + Excel + Web)."
    except Exception as e:
        return f"⚠️ Rebuild failed: {e}"


def rebuild_glossary():
    try:
        from core.glossary_builder import rebuild_and_upload
        rebuild_and_upload()
        return "✅ Glossary rebuilt and uploaded successfully."
    except Exception as e:
        return f"⚠️ Glossary rebuild failed: {e}"


def reset_faiss_cache():
    """
    Completely clears local FAISS and glossary caches, reloads the vector_store module
    (to wipe in-memory runtime caches), then rebuilds glossary + index.
    """
    try:
        # Use the clear helper from core.vector_store if available.
        from importlib import reload
        from core import vector_store

        # If vector_store exposes clear_local_faiss, use it (safe and logged).
        if hasattr(vector_store, "clear_local_faiss"):
            vector_store.clear_local_faiss()
        else:
            # Fallback: manually delete persistent/runtime files.
            paths = [
                "/home/user/app/persistent/faiss.index",
                "/home/user/app/persistent/faiss.index.meta.json",
                "/home/user/app/persistent/glossary.json",
                "/home/user/app/runtime_faiss",
            ]
            for p in paths:
                if os.path.exists(p):
                    try:
                        if os.path.isdir(p):
                            _shutil.rmtree(p, ignore_errors=True)
                        else:
                            os.remove(p)
                        print(f"🗑️ Deleted: {p}")
                    except Exception:
                        pass

        # Reload the module to clear any in-memory caches.
        reload(vector_store)
        print("♻️ FAISS runtime module reloaded to ensure fresh index rebuild.")

        msg = "🧹 Local FAISS + glossary cache cleared. Starting full rebuild...\n\n"
        msg += rebuild_glossary() + "\n"
        msg += rebuild_index()
        return msg
    except Exception as e:
        return f"⚠️ Reset failed: {e}"


# ----------------------------------------------------------
# CHATBOT CORE
# ----------------------------------------------------------
def chat_answer(query, mode="short"):
    try:
        if not query or not str(query).strip():
            return "<i>⚠️ Please enter a valid query.</i>"
        return summarize_combined(str(query).strip(), mode=mode)
    except Exception as e:
        print("❌ Chatbot error:", e)
        return f"<i>⚠️ Error: {e}</i>"


# ----------------------------------------------------------
# GRADIO UI
# ----------------------------------------------------------
with gr.Blocks(theme="gradio/soft") as demo:
    gr.Markdown(f"# {APP_TITLE}")
    gr.Markdown(APP_DESC)

    query_box = gr.Textbox(
        label="Ask your clinical trial question",
        placeholder="e.g. What is an eCRF?",
        lines=2,
    )
    output_box = gr.HTML(label="Answer")

    with gr.Row():
        submit_btn = gr.Button("🚀 Submit", variant="primary")
        if not PUBLIC_MODE:
            rebuild_btn = gr.Button("🔁 Rebuild Index")
            rebuild_glossary_btn = gr.Button("📘 Rebuild Glossary")
            reset_btn = gr.Button("🧹 Reset FAISS Cache (Full Rebuild)")
            clear_btn = gr.Button("🗑️ Clear Index Only")

    submit_btn.click(fn=chat_answer, inputs=[query_box], outputs=output_box)
    query_box.submit(fn=chat_answer, inputs=[query_box], outputs=output_box)

    if not PUBLIC_MODE:
        rebuild_btn.click(fn=rebuild_index, outputs=output_box)
        rebuild_glossary_btn.click(fn=rebuild_glossary, outputs=output_box)
        reset_btn.click(fn=reset_faiss_cache, outputs=output_box)
        clear_btn.click(fn=clear_index, outputs=output_box)


# ----------------------------------------------------------
# LAUNCH APP
# ----------------------------------------------------------
if __name__ == "__main__":
    print("🚀 Starting Clinical Trial Chatbot…")
    print("🧠 Initializing retriever warm-up…")
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        auth=check_admin_login if not PUBLIC_MODE else None,
        ssr_mode=False,
    )
cleanup_space.py ADDED
@@ -0,0 +1,135 @@
"""
cleanup_space.py
----------------
Maintenance script for Hugging Face Space cleanup.
- Removes caches, temp files, and large unneeded assets.
- Keeps only FAISS index + metadata + glossary.
- Reuploads them to the CT-Chat-Index dataset.
"""
import os
import shutil
import time
from datetime import datetime
from huggingface_hub import HfApi, upload_file, HfFolder

# 🔧 Configuration
REPO_ID = "essprasad/CT-Chat-Index"  # Dataset repo
REPO_TYPE = "dataset"
PERSISTENT_DIR = "persistent"
DATA_DIR = "data"
KEEP_FILES = [
    "persistent/faiss.index",
    "persistent/faiss.index.meta.json",
    "data/glossary.json",
]

api = HfApi()
token = HfFolder.get_token() or os.getenv("HF_TOKEN", None)


def readable_size(path):
    """Return human-readable folder size."""
    total = 0
    for dirpath, _, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.exists(fp):
                total += os.path.getsize(fp)
    for unit in ["B", "KB", "MB", "GB"]:
        if total < 1024.0:
            return f"{total:.2f} {unit}"
        total /= 1024.0
    return f"{total:.2f} TB"


# --------------------------------------------------------------------
# 1. Clean caches, logs, temp files
# --------------------------------------------------------------------
def clean_temp_and_cache():
    print("🧹 Cleaning temporary and cache directories...")
    for path in ["/root/.cache", "/home/user/.cache", "/tmp"]:
        shutil.rmtree(path, ignore_errors=True)
        os.makedirs(path, exist_ok=True)

    # Remove logs larger than 5 MB
    log_dir = "logs"
    if os.path.exists(log_dir):
        for f in os.listdir(log_dir):
            fp = os.path.join(log_dir, f)
            if os.path.isfile(fp) and os.path.getsize(fp) > 5 * 1024 * 1024:
                os.remove(fp)
                print(f"🗑️ Removed oversized log: {fp}")


# --------------------------------------------------------------------
# 2. Remove large documents & orphan files
# --------------------------------------------------------------------
def trim_data():
    print("📦 Trimming large files from data/public_docs...")
    doc_dir = os.path.join(DATA_DIR, "public_docs")
    if not os.path.exists(doc_dir):
        return

    for root, _, files in os.walk(doc_dir):
        for f in files:
            fp = os.path.join(root, f)
            if os.path.getsize(fp) > 10 * 1024 * 1024:  # >10 MB
                print(f"🗑️ Removing large doc: {fp}")
                os.remove(fp)


# --------------------------------------------------------------------
# 3. Verify and keep only essential files
# --------------------------------------------------------------------
def preserve_key_files():
    print("🔒 Preserving essential files (index + glossary)...")
    all_keep = []
    for f in KEEP_FILES:
        if os.path.exists(f):
            print(f"✅ Keeping: {f}")
            all_keep.append(f)
        else:
            print(f"⚠️ Missing expected file: {f}")
    return all_keep


# --------------------------------------------------------------------
# 4. Upload cleaned files to dataset
# --------------------------------------------------------------------
def upload_to_hub(files):
    if not token:
        print("❌ No HF token found. Please add HF_TOKEN with write access.")
        return
    print(f"🚀 Uploading cleaned files to {REPO_ID} ...")
    for f in files:
        try:
            upload_file(
                path_or_fileobj=f,
                path_in_repo=f,
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                token=token,
                commit_message=f"Auto-cleanup sync {datetime.utcnow().isoformat()}",
            )
            print(f"✅ Uploaded: {f}")
        except Exception as e:
            print(f"⚠️ Failed to upload {f}: {e}")


# --------------------------------------------------------------------
# 5. Disk usage report
# --------------------------------------------------------------------
def report_usage():
    print("\n📊 Disk Usage Summary:")
    for path in ["persistent", "data", "/home/user"]:
        if os.path.exists(path):
            print(f"{path}: {readable_size(path)}")


# --------------------------------------------------------------------
# Run everything
# --------------------------------------------------------------------
if __name__ == "__main__":
    start = time.time()
    print("===== 🧹 Starting Space Cleanup =====")
    clean_temp_and_cache()
    trim_data()
    files = preserve_key_files()
    upload_to_hub(files)
    report_usage()
    print(f"\n✅ Cleanup finished in {time.time() - start:.2f}s")
gitattributes ADDED
@@ -0,0 +1,49 @@
# ================================================
# ⚙️ Clinical Research Chatbot – Simplified .gitattributes
# ================================================
# Version: Safe for Hugging Face UI-only management
# (no Git LFS required)
# --------------------------------

# --------------------------------
# Code & Config Files (text mode)
# --------------------------------
*.py text eol=lf
*.txt text eol=lf
*.md text eol=lf
*.json text eol=lf
*.csv text eol=lf
*.yaml text eol=lf
*.yml text eol=lf
*.html text eol=lf
*.css text eol=lf
*.js text eol=lf
*.ini text eol=lf
*.cfg text eol=lf
*.toml text eol=lf
requirements.txt text eol=lf
runtime.txt text eol=lf
runtime.yaml text eol=lf
*.gitignore text eol=lf
*.gitattributes text eol=lf

# --------------------------------
# Binary & Data Files (no LFS)
# --------------------------------
*.pdf binary
*.docx binary
*.xlsx binary
*.zip binary
*.ppt binary
*.odt binary
*.png binary
*.jpg binary
*.jpeg binary
*.tif binary
*.tiff binary
*.gif binary

# --------------------------------
# Default handling
# --------------------------------
* text=auto eol=lf
gitignore ADDED
@@ -0,0 +1,71 @@
# =========================================
# 🧪 Clinical Research Chatbot – .gitignore
# =========================================

# -------------------------
# Python
# -------------------------
__pycache__/
*.pyc
*.pyo
*.pyd
*.pkl
*.pickle

# -------------------------
# Environment / virtualenv
# -------------------------
.venv/
env/
venv/
ENV/
*.env

# -------------------------
# Data & Logs
# -------------------------
logs/*
!logs/.gitkeep
# Keep recent chatbot logs
!logs/query_log.csv

# -------------------------
# Data Folders
# -------------------------
# Keep reference docs & FAQs, ignore temporary files
data/public_docs/*
!data/public_docs/.gitkeep

data/faq/*
!data/faq/.gitkeep

# Glossary and metadata files should stay (important for chatbot)
!data/glossary.json
!data/faq_data.json
!data/clinical_faq.json

# Ignore temporary FAISS or index rebuilds
persistent/*
!persistent/.gitkeep
!persistent/faiss.index
!persistent/faiss.index.meta.json

# -------------------------
# Hugging Face + Transformers cache
# -------------------------
.cache/
datasets/
transformers_cache/
.huggingface/

# -------------------------
# IDE / Editor
# -------------------------
.vscode/
.idea/
.DS_Store

# -------------------------
# Miscellaneous
# -------------------------
*.tmp
*.bak
lfsconfig ADDED
@@ -0,0 +1,4 @@
[lfs]
url = https://huggingface.co/
locksverify = true
batch = true
postBuild ADDED
@@ -0,0 +1,60 @@
#!/bin/bash
set -e

echo "🔧 PostBuild starting — optimizing CT-Chat Space..."

# -------------------------------------------------------
# 1️⃣ Fix dependency mismatches (Gradio & Websockets)
# -------------------------------------------------------
pip install --force-reinstall --no-cache-dir "websockets>=12" "gradio-client>=1.3.0"

# -------------------------------------------------------
# 2️⃣ Create and register shared NLTK data directory
# -------------------------------------------------------
echo "📁 Preparing shared NLTK data directory..."
export NLTK_DATA="/usr/local/share/nltk_data"
mkdir -p "$NLTK_DATA"
chmod -R 777 "$NLTK_DATA"

# -------------------------------------------------------
# 3️⃣ Preload all required NLTK resources (including punkt_tab)
# -------------------------------------------------------
echo "📦 Downloading NLTK resources..."
python -m nltk.downloader -d "$NLTK_DATA" \
  punkt punkt_tab averaged_perceptron_tagger averaged_perceptron_tagger_eng stopwords wordnet omw-1.4

# -------------------------------------------------------
# 4️⃣ Verify NLTK installs and paths
# -------------------------------------------------------
python - <<'PYCODE'
import nltk

print(f"NLTK data path → {nltk.data.path}")
# nltk.data.find() expects category-prefixed paths (tokenizers/, taggers/, corpora/).
for pkg in [
    "tokenizers/punkt",
    "tokenizers/punkt_tab",
    "taggers/averaged_perceptron_tagger_eng",
    "corpora/stopwords",
    "corpora/wordnet",
]:
    try:
        nltk.data.find(pkg)
        print(f"✅ Verified NLTK resource: {pkg}")
    except LookupError:
        print(f"⚠️ Missing NLTK resource: {pkg}")
PYCODE

# -------------------------------------------------------
# 5️⃣ Clean caches (stay <50GB)
# -------------------------------------------------------
echo "🧹 Cleaning Hugging Face + Torch caches..."
rm -rf /root/.cache/* || true
rm -rf /home/user/.cache/* || true
rm -rf /usr/local/share/nltk_data/taggers/__pycache__ || true
rm -rf /home/user/app/hf_cache/* || true
rm -rf /home/user/app/logs/* || true

# -------------------------------------------------------
# 6️⃣ Ensure writable temporary cache for runtime
# -------------------------------------------------------
echo "📦 Preparing /tmp/hf_cache..."
mkdir -p /tmp/hf_cache
chmod -R 777 /tmp/hf_cache

# -------------------------------------------------------
# ✅ Done
# -------------------------------------------------------
echo "✅ PostBuild completed successfully — NLTK preloaded (punkt_tab OK), cache ready at /tmp/hf_cache."
requirements.txt ADDED
@@ -0,0 +1,43 @@
# =======================================
# 🧪 Clinical Research Chatbot Requirements
# =======================================

# --- Core Libraries ---
faiss-cpu
torch
transformers
sentence-transformers
sentencepiece
fastapi
whoosh

# --- Data Handling ---
numpy
pandas
datasets

# --- Document Parsing ---
pymupdf
python-docx
openpyxl
beautifulsoup4
requests
aiofiles
rank-bm25

# --- NLP + Text Processing ---
nltk
scikit-learn
regex
tqdm

# --- Web + Interface ---
huggingface-hub>=0.23.0
gradio
gradio-client
uvicorn
spaces
python-multipart

# --- Networking / Compatibility Fix ---
websockets>=12
runtime.txt ADDED
@@ -0,0 +1 @@
python-3.10
runtime.yaml ADDED
@@ -0,0 +1,26 @@
# =======================================
# ⚙️ Hugging Face Space Runtime Configuration
# =======================================

python: "3.10"            # Stable for FAISS + Gradio + Transformers

# App entrypoint (FastAPI with Gradio mount)
entrypoint: "app:app"

hardware: "cpu-basic"     # For small to medium FAISS indexes
# hardware: "cpu-upgrade" # Uncomment for larger index (>100 MB) or slower summaries

timeout: 600              # 10-minute build timeout
autoreload: true          # Auto-reload app on file updates (optional)

# Cache persistent resources (prevents redownload)
cache:
  - data/
  - persistent/
  - logs/

# Explicit build hook (optional, for clarity)
build:
  commands:
    - bash postBuild