# bee/scripts/deploy_hf_space.py
# Last deploy: "HF Space backend deploy [de0cba5]" (commit 5e21013)
"""Deploy Bee backend to HF Space `cuilabs/bee` via curated git push.
The Space's Dockerfile only consumes a subset of the repo. Pushing the
full monorepo (apps/, packages/, docs/, tests/, supabase/, ...) bloats
the Space's git history with ~140k lines that the Docker build ignores.
This script builds a focused deploy by:
1. Resolving the current `master` commit SHA.
2. Copying ONLY the paths the Dockerfile needs into a temp dir.
3. Initialising a fresh git repo there, committing as
"HF Space backend deploy [<sha>]".
4. Force-pushing to the space remote's `main` branch β€” HF Spaces
build from the current tree, not the git history; force-push is
correct (no commit data is lost; the source of truth is GitHub).
5. Cleaning up the temp dir.
The Space rebuild starts automatically after the push (~2-10 min,
visible at https://huggingface.co/spaces/cuilabs/bee).
Usage:
python scripts/deploy_hf_space.py [--dry-run]
Authentication: the script reuses the credentials baked into the
`space` git remote (https://huggingface.co/spaces/cuilabs/bee). If
you've never pushed before, run `huggingface-cli login` first or set
HF_TOKEN in the environment so the http auth helper picks it up.
Curation list β€” kept in sync with the Dockerfile's COPY directives.
Update both when adding new runtime dependencies.
"""
from __future__ import annotations
import argparse
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
# Repo root — this script lives in <root>/scripts/, hence two .parent hops.
ROOT: Path = Path(__file__).resolve().parent.parent

# Files / dirs the Space's Dockerfile depends on. If you add a COPY in
# Dockerfile, also add the path here.
REQUIRED_PATHS: list[str] = [
    "Dockerfile",
    "requirements.docker.txt",
    "requirements.txt",
    "README.md",
    ".env.example",
    "pyproject.toml",
    "bee",
    "scripts",
]

# Optional — present locally during dev, shipped only if they exist.
# (Note: the chat UI moved to bee/static/ in 770a763, so a top-level
# `static/` is no longer expected; the bee/ copy covers it.)
OPTIONAL_PATHS: list[str] = [
    "data/datasets",
    "data/rag_index",
    "data/lora_checkpoints",
]

# Patterns to exclude when copying directories — keep the Space lean.
# (shutil.ignore_patterns returns a callable suitable for copytree's
# `ignore=` parameter.)
IGNORE = shutil.ignore_patterns(
    "__pycache__",
    "*.pyc",
    "*.pyo",
    ".pytest_cache",
    ".DS_Store",
    ".mypy_cache",
    ".ruff_cache",
    "*.log",
    ".venv",
    "node_modules",
)

# HF rejects pushes containing files larger than this (10 MiB). The Space
# downloads its real artifacts (adapters, RAG indices) at runtime via
# bee/hub_sync.py from HF Hub — pre-baked large files are dev-only
# cruft that shouldn't be in the deploy.
MAX_FILE_SIZE: int = 10 * 1024 * 1024

SPACE_REMOTE: str = "https://huggingface.co/spaces/cuilabs/bee"
SPACE_BRANCH: str = "main"  # confirmed via `git ls-remote space`

# HF Spaces require YAML frontmatter at the top of README.md to set
# the Space's config (sdk, port, title, etc.). Local README.md is the
# marketing-facing doc and intentionally has no frontmatter — we inject
# the Space-specific block at deploy time only.
#
# Without this, the Space lands in CONFIG_ERROR (cardData.sdk = None)
# because HF re-reads cardData from README on every push.
#
# app_port: 7860 is the HF Spaces default and what the runtime actually
# binds to regardless of what we set. The previous app_port: 8000 caused
# RUNTIME_ERROR — HF's reverse proxy probed :8000 forever, container was
# bound on :7860, healthcheck never reported healthy, Space killed at
# the 30-min watchdog deadline. Verified against actual run logs of the
# 5a22d328 deploy (2026-04-29).
HF_SPACE_FRONTMATTER: str = """---
title: Bee Intelligence Engine
emoji: 🐝
colorFrom: yellow
colorTo: gray
sdk: docker
app_port: 7860
pinned: true
license: apache-2.0
short_description: The Intelligence Engine β€” domain LoRA adapters
---
"""
def run(cmd: list[str], cwd: Path) -> subprocess.CompletedProcess[str]:
    """Run *cmd* in *cwd* and return the completed process.

    Output is captured as text; a non-zero exit raises
    subprocess.CalledProcessError (check=True).
    """
    return subprocess.run(
        cmd,
        cwd=cwd,
        check=True,
        capture_output=True,
        text=True,
    )
def _copy_required(tmp: Path) -> None:
    """Copy every REQUIRED_PATHS entry from ROOT into *tmp*.

    Exits with status 2 if a required path is missing. README.md gets
    the HF Space YAML frontmatter prepended when the local copy has
    none — HF re-reads sdk/app_port from the README card on every push.
    """
    for rel in REQUIRED_PATHS:
        src = ROOT / rel
        if not src.exists():
            print(f" ✗ MISSING required path: {rel}")
            sys.exit(2)
        dst = tmp / rel
        dst.parent.mkdir(parents=True, exist_ok=True)
        if src.is_dir():
            shutil.copytree(src, dst, ignore=IGNORE)
        elif rel == "README.md":
            # Inject HF Space frontmatter only if not already present.
            content = src.read_text(encoding="utf-8")
            if not content.lstrip().startswith("---"):
                dst.write_text(HF_SPACE_FRONTMATTER + content, encoding="utf-8")
                print(f" + {rel} (with injected HF frontmatter)")
                continue
            shutil.copy2(src, dst)
        else:
            shutil.copy2(src, dst)
        print(f" + {rel}")


def _copy_optional(tmp: Path) -> None:
    """Copy OPTIONAL_PATHS entries that exist locally; skip the rest."""
    for rel in OPTIONAL_PATHS:
        src = ROOT / rel
        if not src.exists():
            print(f" - {rel} (optional, not present, skipped)")
            continue
        dst = tmp / rel
        dst.parent.mkdir(parents=True, exist_ok=True)
        if src.is_dir():
            shutil.copytree(src, dst, ignore=IGNORE)
        else:
            shutil.copy2(src, dst)
        print(f" + {rel} (optional, present)")


def _strip_large_files(tmp: Path) -> None:
    """Delete files in *tmp* larger than MAX_FILE_SIZE and report them.

    HF rejects pushes containing such files; the Space downloads its
    real artifacts (adapters, RAG indices) at startup via
    bee/hub_sync.py, so baking them in is dev-only cruft.
    """
    stripped: list[tuple[Path, int]] = []
    # Materialise the walk before unlinking so deletion can't disturb it.
    for f in list(tmp.rglob("*")):
        if not f.is_file():
            continue
        size = f.stat().st_size  # stat once; reused for check and report
        if size > MAX_FILE_SIZE:
            stripped.append((f, size))
            f.unlink()
    if stripped:
        print(f"\n stripped {len(stripped)} file(s) larger than {MAX_FILE_SIZE // (1024 * 1024)} MiB:")
        for f, size in stripped:
            rel = f.relative_to(tmp)
            print(f" - {rel} ({size / 1024 / 1024:.1f} MiB)")


def _push_to_space(tmp: Path, sha: str, full_sha: str) -> None:
    """Init a throwaway git repo in *tmp* and force-push it as the Space's main.

    Force is correct here: the Space's git is just a deploy surface —
    the source-of-truth history lives on GitHub. Exits with the git
    push's return code on failure.
    """
    run(["git", "init", "-q", "--initial-branch=main"], cwd=tmp)
    run(["git", "config", "user.name", "Bee Deploy"], cwd=tmp)
    run(["git", "config", "user.email", "ops@cuilabs.io"], cwd=tmp)
    run(["git", "add", "-A"], cwd=tmp)
    run(["git", "commit", "-q", "-m",
         f"HF Space backend deploy [{sha}]\n\nGitHub master: {full_sha}"], cwd=tmp)
    run(["git", "remote", "add", "space", SPACE_REMOTE], cwd=tmp)
    # Don't use run() here: a failed push should surface stderr cleanly
    # rather than raise CalledProcessError.
    push = subprocess.run(
        ["git", "push", "--force", "space", f"main:{SPACE_BRANCH}"],
        cwd=tmp, capture_output=True, text=True,
    )
    if push.returncode != 0:
        print(f" push failed:\n{push.stderr}", file=sys.stderr)
        sys.exit(push.returncode)
    print(f"\n pushed → {SPACE_REMOTE}:{SPACE_BRANCH}")
    print(" HF Space is rebuilding now. Verify at:")
    print(" https://huggingface.co/spaces/cuilabs/bee")
    print(" https://cuilabs-bee.hf.space/v1/adapters (404 → still building)")


def main() -> None:
    """Build the curated deploy tree in a temp dir and push it to the Space.

    With --dry-run, builds and summarises the tree but skips the push.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--dry-run", action="store_true",
                   help="build the deploy tree but skip the push")
    args = p.parse_args()

    sha = run(["git", "rev-parse", "--short", "HEAD"], cwd=ROOT).stdout.strip()
    full_sha = run(["git", "rev-parse", "HEAD"], cwd=ROOT).stdout.strip()
    branch = run(["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=ROOT).stdout.strip()
    print(f"deploying {sha} (branch {branch}) to {SPACE_REMOTE}:{SPACE_BRANCH}")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmp = Path(tmpdir)
        _copy_required(tmp)
        _copy_optional(tmp)
        _strip_large_files(tmp)
        if args.dry_run:
            # Single walk: collect files once, derive both count and size.
            files = [f for f in tmp.rglob("*") if f.is_file()]
            total_bytes = sum(f.stat().st_size for f in files)
            print(f"\n[dry-run] {len(files)} files, {total_bytes:,} bytes total. Skipping push.")
            return
        _push_to_space(tmp, sha, full_sha)
if __name__ == "__main__":
main()