# hackathon/scripts/seed_demo_artifacts.py
# feat(deploy): demo-artifact seeding for Hugging Face Space build + entrypoint (c00e850)
"""Seed demo artifacts so every showcase path works without external data.
Idempotent — skips any artifact that already exists. Safe to call during
Docker build OR at container start.
Generates:
- data/processed/mri_dl_2d/best_model.pt (random resnet18 4-class)
- data/processed/mri_model.onnx (dynamic-D/H/W ONNX, biased toward 'abnormal')
- data/processed/eeg_clf.joblib (synthetic-separable RandomForest)
- data/external_rag/index/rag_index.pkl (4-chunk synthetic clinical TF-IDF)
- tests/fixtures/mri_sample/subject_0_axial.png (axial slice from the bundled NIfTI)
"""
from __future__ import annotations
import sys
from pathlib import Path
def seed_mri_dl_2d() -> Path:
    """Write a randomly initialised 4-class resnet18 ``state_dict`` checkpoint.

    Idempotent: if the artifact already exists it is returned untouched.
    Imports torch/torchvision lazily so merely importing this module stays cheap.
    """
    target = Path("data/processed/mri_dl_2d/best_model.pt")
    if not target.exists():
        target.parent.mkdir(parents=True, exist_ok=True)
        import torch
        from torchvision import models

        # weights=None -> random init; good enough for a demo classifier artifact.
        net = models.resnet18(weights=None)
        net.fc = torch.nn.Linear(net.fc.in_features, 4)  # re-head to 4 classes
        torch.save(net.state_dict(), str(target))
    return target
def seed_mri_volumetric_onnx() -> Path:
    """Write a tiny ONNX "classifier" that emits fixed logits for any volume.

    The graph ignores its input and returns constant logits [0.3, 2.1], i.e.
    biased toward class index 1. Idempotent: skipped if the file exists.
    """
    target = Path("data/processed/mri_model.onnx")
    if target.exists():
        return target
    target.parent.mkdir(parents=True, exist_ok=True)

    import onnx
    from onnx import TensorProto, helper

    # Symbolic D/H/W dims so any spatial shape passes session shape checks.
    inp = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 1, "D", "H", "W"])
    outp = helper.make_tensor_value_info("logits", TensorProto.FLOAT, [1, 2])

    const_logits = helper.make_tensor("const_logits", TensorProto.FLOAT, [1, 2], [0.3, 2.1])
    const_node = helper.make_node("Constant", inputs=[], outputs=["logits"], value=const_logits)

    graph = helper.make_graph([const_node], "demo_mri_classifier", [inp], [outp])
    mdl = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
    # NOTE(review): IR version pinned to 10 — presumably for onnxruntime
    # compatibility; confirm against the runtime used by the Space image.
    mdl.ir_version = 10
    onnx.save(mdl, str(target))
    return target
def seed_eeg_clf() -> Path:
    """Fit and persist a RandomForest on two well-separated synthetic blobs.

    Class 0 and class 1 samples are Gaussian clusters two standard deviations
    apart, so the forest separates them trivially. Fixed seeds make the saved
    artifact identical on every build. Idempotent: skipped if file exists.
    """
    target = Path("data/processed/eeg_clf.joblib")
    if target.exists():
        return target
    target.parent.mkdir(parents=True, exist_ok=True)

    import joblib
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    rng = np.random.default_rng(0)
    n_features = 16
    # Draw order (class 0 first) matters: it fixes the RNG stream and thus
    # reproduces the exact same training matrix as before.
    features = np.vstack([
        rng.normal(0.0, 1.0, size=(100, n_features)),  # class 0 ("ctrl")
        rng.normal(2.0, 1.0, size=(100, n_features)),  # class 1 ("alz")
    ])
    labels = np.array([0] * 100 + [1] * 100)

    forest = RandomForestClassifier(n_estimators=12, max_depth=6, random_state=0)
    forest.fit(features, labels)
    joblib.dump(forest, str(target))
    return target
def seed_clinical_rag_index() -> Path:
    """Tiny synthetic clinical TF-IDF index (4 chunks). Replace with the real
    pre-built pickle to upgrade quality without code changes."""
    target = Path("data/external_rag/index/rag_index.pkl")
    if target.exists():
        return target
    target.parent.mkdir(parents=True, exist_ok=True)

    import pickle
    from datetime import datetime
    from sklearn.feature_extraction.text import TfidfVectorizer
    from src.rag.clinical.types import ClinicalChunk

    # (chunk_id, source, page_start, page_end, text) rows for the demo corpus.
    rows = [
        (0, "alzheimers_lifestyle.pdf", 1, 1,
         "Aerobic exercise and Mediterranean diet are associated with reduced cognitive decline in older adults at risk for Alzheimer's disease."),
        (1, "parkinsons_motor.pdf", 1, 1,
         "Levodopa remains the most effective symptomatic treatment for motor symptoms of Parkinson's disease."),
        (2, "alzheimers_mci.pdf", 2, 2,
         "Mild cognitive impairment may progress to dementia; MMSE and MoCA are standard screening tools."),
        (3, "parkinsons_nutrition.pdf", 1, 1,
         "Dietary patterns rich in antioxidants and omega-3 fatty acids are linked to lower Parkinson's risk."),
    ]
    chunks = [ClinicalChunk(*row) for row in rows]

    vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), min_df=1, norm="l2")
    matrix = vectorizer.fit_transform([chunk.text for chunk in chunks])

    with target.open("wb") as fh:
        pickle.dump(
            {
                "created_at": datetime.now().isoformat(timespec="seconds"),
                # NOTE(review): this records the INDEX directory, not a source
                # document directory — confirm that is what readers expect.
                "source_dir": str(target.parent),
                "chunk_words": 220,
                "overlap_words": 45,
                "chunks": chunks,
                "vectorizer": vectorizer,
                "matrix": matrix,
            },
            fh,
        )
    return target
def seed_axial_png() -> Path:
    """Render the middle axial slice of the bundled NIfTI as an 8-bit PNG.

    Used by the Researcher tab as a ready-made image fixture. Idempotent:
    skipped if the PNG already exists. Raises if the source NIfTI fixture
    (tests/fixtures/mri_sample/subject_0.nii.gz) is missing.
    """
    target = Path("tests/fixtures/mri_sample/subject_0_axial.png")
    if target.exists():
        return target
    target.parent.mkdir(parents=True, exist_ok=True)

    import nibabel as nib
    import numpy as np
    from PIL import Image

    nifti_path = Path("tests/fixtures/mri_sample/subject_0.nii.gz")
    volume = np.asarray(nib.load(str(nifti_path)).get_fdata(), dtype=np.float32)
    axial = volume[:, :, volume.shape[2] // 2]  # mid-slice along the 3rd axis
    # Min-max normalise to [0, 1]; the 1e-6 floor guards a flat slice from /0.
    scaled = (axial - axial.min()) / max(axial.max() - axial.min(), 1e-6)
    Image.fromarray((scaled * 255).astype(np.uint8), mode="L").save(str(target))
    return target
def main() -> int:
    """Run every seeder in order, printing per-artifact status.

    Returns 0 when all artifacts seed (or already exist); returns 1 and stops
    at the first seeder that raises.
    """
    seeders = (
        ("MRI 2D resnet18 state_dict", seed_mri_dl_2d),
        ("MRI volumetric ONNX", seed_mri_volumetric_onnx),
        ("EEG sklearn classifier", seed_eeg_clf),
        ("Clinical TF-IDF RAG index", seed_clinical_rag_index),
        ("Axial PNG fixture", seed_axial_png),
    )
    print("Seeding demo artifacts...", flush=True)
    for name, seeder in seeders:
        # stat()/size reporting stays inside the try so a vanished file is
        # reported as a failure rather than crashing the loop.
        try:
            artifact = seeder()
            size_kb = artifact.stat().st_size // 1024 if artifact.is_file() else 0
            print(f" OK {name:35s} {artifact} ({size_kb} KB)", flush=True)
        except Exception as exc:  # top-level boundary: report and bail
            print(f" FAIL {name}: {type(exc).__name__}: {exc}", flush=True)
            return 1
    print("Done.", flush=True)
    return 0
if __name__ == "__main__":
    # SystemExit(int) propagates the same process exit code as sys.exit(main()).
    raise SystemExit(main())