fix(deploy): harden HF Space build — git, MLflow off, seed artifacts early
The previous build's truncated log showed pip install succeeded but
something later in the heavy pipeline-train RUN block aborted. Three
defensive changes so the next build either succeeds or fails with a
clearer diagnosis:
- apt-get install git: silences the MLflow 'Bad git executable' warning
and lets MLflow tag runs with a proper SHA when needed.
- NEUROBRIDGE_DISABLE_MLFLOW=1 prefixed on every build-time pipeline
invocation: avoids MLflow run-tagging fragility in the slim image
during build. The runtime entrypoint can re-enable MLflow if desired.
- Move 'python scripts/seed_demo_artifacts.py' BEFORE the pipeline
train block so the core showcase paths (MRI 2D / MRI ONNX / EEG joblib
/ clinical RAG / axial PNG) are guaranteed to land even if the BBB
classifier train or MRI ComBat pipeline trips. The seed step is also
re-run after RAG ingest (idempotent — only fills missing artifacts).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- Dockerfile +16 -8
- Dockerfile.hf +16 -8
|
@@ -13,6 +13,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
|
|
| 13 |
# --- system deps for RDKit, nibabel, MNE ---
|
| 14 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 15 |
build-essential \
|
|
|
|
| 16 |
libgomp1 \
|
| 17 |
libxrender1 \
|
| 18 |
libsm6 \
|
|
@@ -40,17 +41,26 @@ COPY supervisord.conf ./supervisord.conf
|
|
| 40 |
COPY docker-entrypoint.sh ./docker-entrypoint.sh
|
| 41 |
RUN chmod +x /app/docker-entrypoint.sh
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
# Seed raw data from fixtures so the deployed Signal/Image/Molecule tabs
|
| 44 |
# work on first click. Then run all three pipelines so mlruns/ contains
|
| 45 |
# one run per modality — feeds /experiments/runs and the BBB provenance
|
| 46 |
# strip. data/raw/* is gitignored locally so we cannot COPY it.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
RUN mkdir -p data/raw data/processed && \
|
| 48 |
cp tests/fixtures/bbbp_sample.csv data/raw/bbbp.csv && \
|
| 49 |
cp tests/fixtures/eeg_sample.fif data/raw/eeg.fif && \
|
| 50 |
-
python -m src.pipelines.bbb_pipeline && \
|
| 51 |
-
python -m src.models.bbb_model && \
|
| 52 |
-
python -c "from pathlib import Path; from src.pipelines.eeg_pipeline import run_pipeline; run_pipeline(input_path=Path('tests/fixtures/eeg_sample.fif'), output_path=Path('data/processed/eeg_features.parquet'))" && \
|
| 53 |
-
python -c "from pathlib import Path; from src.pipelines.mri_pipeline import run_pipeline; run_pipeline(input_dir=Path('tests/fixtures/mri_sample'), sites_csv=Path('tests/fixtures/mri_sample/sites.csv'), output_path=Path('data/processed/mri_features.parquet'))"
|
| 54 |
|
| 55 |
# --- RAG knowledge base ingest ---
|
| 56 |
# Build the FAISS index from any seed docs in tests/fixtures/kb_sample/
|
|
@@ -60,10 +70,8 @@ RUN mkdir -p data/raw data/processed && \
|
|
| 60 |
COPY tests/fixtures/kb_sample/ ./data/knowledge_base/seed/
|
| 61 |
RUN python -m src.rag.ingest data/knowledge_base data/processed/faiss_index
|
| 62 |
|
| 63 |
-
# ---
|
| 64 |
-
#
|
| 65 |
-
# entrypoint also re-runs it on container start so a mounted-volume
|
| 66 |
-
# deployment can re-seed without a rebuild.
|
| 67 |
RUN python scripts/seed_demo_artifacts.py
|
| 68 |
|
| 69 |
# --- HF Spaces convention ---
|
|
|
|
| 13 |
# --- system deps for RDKit, nibabel, MNE ---
|
| 14 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 15 |
build-essential \
|
| 16 |
+
git \
|
| 17 |
libgomp1 \
|
| 18 |
libxrender1 \
|
| 19 |
libsm6 \
|
|
|
|
| 41 |
COPY docker-entrypoint.sh ./docker-entrypoint.sh
|
| 42 |
RUN chmod +x /app/docker-entrypoint.sh
|
| 43 |
|
| 44 |
+
# Seed demo artifacts FIRST so even if a heavier pipeline step fails, the
|
| 45 |
+
# core showcase paths (MRI 2D, MRI volumetric ONNX, EEG joblib, clinical
|
| 46 |
+
# RAG, axial PNG) still work. seed_demo_artifacts.py is idempotent.
|
| 47 |
+
RUN python scripts/seed_demo_artifacts.py
|
| 48 |
+
|
| 49 |
# Seed raw data from fixtures so the deployed Signal/Image/Molecule tabs
|
| 50 |
# work on first click. Then run all three pipelines so mlruns/ contains
|
| 51 |
# one run per modality — feeds /experiments/runs and the BBB provenance
|
| 52 |
# strip. data/raw/* is gitignored locally so we cannot COPY it.
|
| 53 |
+
#
|
| 54 |
+
# NEUROBRIDGE_DISABLE_MLFLOW=1 during build avoids MLflow run-tagging
|
| 55 |
+
# fragility in the slim image (no real .git tree to tag against). The
|
| 56 |
+
# entrypoint can re-run with MLflow on if desired.
|
| 57 |
RUN mkdir -p data/raw data/processed && \
|
| 58 |
cp tests/fixtures/bbbp_sample.csv data/raw/bbbp.csv && \
|
| 59 |
cp tests/fixtures/eeg_sample.fif data/raw/eeg.fif && \
|
| 60 |
+
NEUROBRIDGE_DISABLE_MLFLOW=1 python -m src.pipelines.bbb_pipeline && \
|
| 61 |
+
NEUROBRIDGE_DISABLE_MLFLOW=1 python -m src.models.bbb_model && \
|
| 62 |
+
NEUROBRIDGE_DISABLE_MLFLOW=1 python -c "from pathlib import Path; from src.pipelines.eeg_pipeline import run_pipeline; run_pipeline(input_path=Path('tests/fixtures/eeg_sample.fif'), output_path=Path('data/processed/eeg_features.parquet'))" && \
|
| 63 |
+
NEUROBRIDGE_DISABLE_MLFLOW=1 python -c "from pathlib import Path; from src.pipelines.mri_pipeline import run_pipeline; run_pipeline(input_dir=Path('tests/fixtures/mri_sample'), sites_csv=Path('tests/fixtures/mri_sample/sites.csv'), output_path=Path('data/processed/mri_features.parquet'))"
|
| 64 |
|
| 65 |
# --- RAG knowledge base ingest ---
|
| 66 |
# Build the FAISS index from any seed docs in tests/fixtures/kb_sample/
|
|
|
|
| 70 |
COPY tests/fixtures/kb_sample/ ./data/knowledge_base/seed/
|
| 71 |
RUN python -m src.rag.ingest data/knowledge_base data/processed/faiss_index
|
| 72 |
|
| 73 |
+
# --- Re-run demo-artifact seeding after RAG ingest in case any step above
|
| 74 |
+
# altered what's on disk. Idempotent — only fills missing artifacts.
|
|
|
|
|
|
|
| 75 |
RUN python scripts/seed_demo_artifacts.py
|
| 76 |
|
| 77 |
# --- HF Spaces convention ---
|
|
@@ -13,6 +13,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
|
|
| 13 |
# --- system deps for RDKit, nibabel, MNE ---
|
| 14 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 15 |
build-essential \
|
|
|
|
| 16 |
libgomp1 \
|
| 17 |
libxrender1 \
|
| 18 |
libsm6 \
|
|
@@ -40,17 +41,26 @@ COPY supervisord.conf ./supervisord.conf
|
|
| 40 |
COPY docker-entrypoint.sh ./docker-entrypoint.sh
|
| 41 |
RUN chmod +x /app/docker-entrypoint.sh
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
# Seed raw data from fixtures so the deployed Signal/Image/Molecule tabs
|
| 44 |
# work on first click. Then run all three pipelines so mlruns/ contains
|
| 45 |
# one run per modality — feeds /experiments/runs and the BBB provenance
|
| 46 |
# strip. data/raw/* is gitignored locally so we cannot COPY it.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
RUN mkdir -p data/raw data/processed && \
|
| 48 |
cp tests/fixtures/bbbp_sample.csv data/raw/bbbp.csv && \
|
| 49 |
cp tests/fixtures/eeg_sample.fif data/raw/eeg.fif && \
|
| 50 |
-
python -m src.pipelines.bbb_pipeline && \
|
| 51 |
-
python -m src.models.bbb_model && \
|
| 52 |
-
python -c "from pathlib import Path; from src.pipelines.eeg_pipeline import run_pipeline; run_pipeline(input_path=Path('tests/fixtures/eeg_sample.fif'), output_path=Path('data/processed/eeg_features.parquet'))" && \
|
| 53 |
-
python -c "from pathlib import Path; from src.pipelines.mri_pipeline import run_pipeline; run_pipeline(input_dir=Path('tests/fixtures/mri_sample'), sites_csv=Path('tests/fixtures/mri_sample/sites.csv'), output_path=Path('data/processed/mri_features.parquet'))"
|
| 54 |
|
| 55 |
# --- RAG knowledge base ingest ---
|
| 56 |
# Build the FAISS index from any seed docs in tests/fixtures/kb_sample/
|
|
@@ -60,10 +70,8 @@ RUN mkdir -p data/raw data/processed && \
|
|
| 60 |
COPY tests/fixtures/kb_sample/ ./data/knowledge_base/seed/
|
| 61 |
RUN python -m src.rag.ingest data/knowledge_base data/processed/faiss_index
|
| 62 |
|
| 63 |
-
# ---
|
| 64 |
-
#
|
| 65 |
-
# entrypoint also re-runs it on container start so a mounted-volume
|
| 66 |
-
# deployment can re-seed without a rebuild.
|
| 67 |
RUN python scripts/seed_demo_artifacts.py
|
| 68 |
|
| 69 |
# --- HF Spaces convention ---
|
|
|
|
| 13 |
# --- system deps for RDKit, nibabel, MNE ---
|
| 14 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 15 |
build-essential \
|
| 16 |
+
git \
|
| 17 |
libgomp1 \
|
| 18 |
libxrender1 \
|
| 19 |
libsm6 \
|
|
|
|
| 41 |
COPY docker-entrypoint.sh ./docker-entrypoint.sh
|
| 42 |
RUN chmod +x /app/docker-entrypoint.sh
|
| 43 |
|
| 44 |
+
# Seed demo artifacts FIRST so even if a heavier pipeline step fails, the
|
| 45 |
+
# core showcase paths (MRI 2D, MRI volumetric ONNX, EEG joblib, clinical
|
| 46 |
+
# RAG, axial PNG) still work. seed_demo_artifacts.py is idempotent.
|
| 47 |
+
RUN python scripts/seed_demo_artifacts.py
|
| 48 |
+
|
| 49 |
# Seed raw data from fixtures so the deployed Signal/Image/Molecule tabs
|
| 50 |
# work on first click. Then run all three pipelines so mlruns/ contains
|
| 51 |
# one run per modality — feeds /experiments/runs and the BBB provenance
|
| 52 |
# strip. data/raw/* is gitignored locally so we cannot COPY it.
|
| 53 |
+
#
|
| 54 |
+
# NEUROBRIDGE_DISABLE_MLFLOW=1 during build avoids MLflow run-tagging
|
| 55 |
+
# fragility in the slim image (no real .git tree to tag against). The
|
| 56 |
+
# entrypoint can re-run with MLflow on if desired.
|
| 57 |
RUN mkdir -p data/raw data/processed && \
|
| 58 |
cp tests/fixtures/bbbp_sample.csv data/raw/bbbp.csv && \
|
| 59 |
cp tests/fixtures/eeg_sample.fif data/raw/eeg.fif && \
|
| 60 |
+
NEUROBRIDGE_DISABLE_MLFLOW=1 python -m src.pipelines.bbb_pipeline && \
|
| 61 |
+
NEUROBRIDGE_DISABLE_MLFLOW=1 python -m src.models.bbb_model && \
|
| 62 |
+
NEUROBRIDGE_DISABLE_MLFLOW=1 python -c "from pathlib import Path; from src.pipelines.eeg_pipeline import run_pipeline; run_pipeline(input_path=Path('tests/fixtures/eeg_sample.fif'), output_path=Path('data/processed/eeg_features.parquet'))" && \
|
| 63 |
+
NEUROBRIDGE_DISABLE_MLFLOW=1 python -c "from pathlib import Path; from src.pipelines.mri_pipeline import run_pipeline; run_pipeline(input_dir=Path('tests/fixtures/mri_sample'), sites_csv=Path('tests/fixtures/mri_sample/sites.csv'), output_path=Path('data/processed/mri_features.parquet'))"
|
| 64 |
|
| 65 |
# --- RAG knowledge base ingest ---
|
| 66 |
# Build the FAISS index from any seed docs in tests/fixtures/kb_sample/
|
|
|
|
| 70 |
COPY tests/fixtures/kb_sample/ ./data/knowledge_base/seed/
|
| 71 |
RUN python -m src.rag.ingest data/knowledge_base data/processed/faiss_index
|
| 72 |
|
| 73 |
+
# --- Re-run demo-artifact seeding after RAG ingest in case any step above
|
| 74 |
+
# altered what's on disk. Idempotent — only fills missing artifacts.
|
|
|
|
|
|
|
| 75 |
RUN python scripts/seed_demo_artifacts.py
|
| 76 |
|
| 77 |
# --- HF Spaces convention ---
|