budget-router-openenv / scripts /hf_export_build_commit.py
Akshay Babbar
chore: HF Space export (size filter)
98a5a8c
#!/usr/bin/env python3
"""
Build a *new* commit (orphan) from the current `HEAD` tree, omitting:
- any binary blob (HF rejects plain-Git binaries unless Xet is used)
- any blob with size > MAX_BYTES (default 10 MiB)
- any subtree whose *sum* of **remaining** blob sizes under a directory prefix exceeds MAX_BYTES
(repeatedly removes the shortest over-budget prefix first)
Prints the new commit object id (40 hex) to stdout. By default, stderr prints a one-line
omit summary; set QUIET=1 to suppress it or VERBOSE=1 to list omitted paths.
Run from the repository root (or any path inside the repo). Uses a temporary
GIT_INDEX_FILE; does not modify the working tree or the current branch.
"""
from __future__ import annotations
import os
import subprocess
import sys
import tempfile
from collections import defaultdict
from dataclasses import dataclass
@dataclass(frozen=True)
class Blob:
path: str
size: int
mode: str
oid: str
def sh(args: list[str], cwd: str, env: dict[str, str] | None = None) -> str:
menv = {**os.environ, "LC_ALL": "C"} if env is None else env
return (
subprocess.check_output(
args,
cwd=cwd,
stderr=subprocess.DEVNULL,
env=menv,
).decode("utf-8", "replace")
).strip()
def list_blobs(cwd: str) -> list[Blob]:
out = sh(["git", "ls-tree", "-l", "-r", "HEAD"], cwd=cwd)
blobs: list[Blob] = []
for line in out.splitlines():
if "\t" not in line:
continue
meta, path = line.split("\t", 1)
toks = meta.split()
if len(toks) < 4:
continue
mode, otype, _oid, size_s = toks[0], toks[1], toks[2], toks[3]
if otype == "commit":
continue
if otype != "blob":
continue
blobs.append(Blob(path=path, size=int(size_s), mode=mode, oid=toks[2]))
return blobs
def is_binary_blob(cwd: str, oid: str) -> bool:
data = subprocess.check_output(
["git", "cat-file", "blob", oid],
cwd=cwd,
stderr=subprocess.DEVNULL,
)[:8192]
if b"\0" in data:
return True
try:
data.decode("utf-8")
except UnicodeDecodeError:
return True
return False
def parent_prefixes(path: str) -> list[str]:
if "/" not in path:
return []
parts = path.split("/")
return ["/".join(parts[:i]) for i in range(1, len(parts))]
def apply_rules(
cwd: str,
blobs: list[Blob],
max_b: int,
*,
verbose: bool,
) -> tuple[dict[str, Blob], dict[str, int]]:
"""Return keep map and omitted counts by reason."""
keep: dict[str, Blob] = {}
omitted = {"binary": 0, "file": 0, "subtree": 0}
for b in blobs:
if b.size > max_b:
omitted["file"] += 1
if verbose:
print(f"Omit: file {b.path!r} ({b.size} B) > {max_b} B", file=sys.stderr)
elif is_binary_blob(cwd, b.oid):
omitted["binary"] += 1
if verbose:
print(f"Omit: binary file {b.path!r}", file=sys.stderr)
else:
keep[b.path] = b
while True:
psum: dict[str, int] = defaultdict(int)
for p, b in keep.items():
for d in parent_prefixes(p):
psum[d] += b.size
bad = [d for d, t in psum.items() if t > max_b]
if not bad:
break
victim = min(bad, key=lambda d: (len(d.split("/")), d))
to_del = [p for p in keep if p == victim or p.startswith(victim + "/")]
for p in to_del:
del keep[p]
omitted["subtree"] += 1
if verbose:
print(
f"Omit: subtree {victim!r} (sum of kept blobs under prefix > {max_b} B)",
file=sys.stderr,
)
return keep, omitted
def make_export_commit(cwd: str, paths: list[str]) -> str:
if not paths:
raise SystemExit("error: nothing to commit after size filter (empty tree)")
empty = "4b825dc642cb6eb9a060e54bf8d69288fbee4904" # empty tree object
with tempfile.TemporaryDirectory() as tdir:
gidx = os.path.join(tdir, "i")
env: dict[str, str] = {**os.environ, "GIT_INDEX_FILE": gidx}
subprocess.check_call(
["git", "read-tree", empty], cwd=cwd, env=env, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL
)
for p in paths:
r = subprocess.run(
["git", "ls-tree", "HEAD", "--", p],
cwd=cwd,
env=env,
capture_output=True,
text=True,
)
if r.returncode != 0 or not r.stdout.strip():
continue
line = r.stdout.strip()
mline, path_f = line.split("\t", 1)
p_use = path_f if path_f else p
toks = mline.split()
if len(toks) < 3:
continue
mode, otype, oid = toks[0], toks[1], toks[2]
if otype != "blob":
continue
subprocess.check_call(
["git", "update-index", "--add", "--cacheinfo", f"{mode},{oid},{p_use}"],
cwd=cwd,
env=env,
stdout=subprocess.DEVNULL,
)
tree = subprocess.check_output(["git", "write-tree"], cwd=cwd, env=env).decode().strip()
msg = os.environ.get("HF_EXPORT_MSG", "chore: HF Space export (size filter)")
commit = sh(["git", "commit-tree", tree, "-m", msg], cwd=cwd, env=env)
if len(commit) < 4:
raise SystemExit("error: could not build export commit")
return commit
def main() -> None:
top = sh(["git", "rev-parse", "--show-toplevel"], cwd=os.path.abspath("."))
if not top:
raise SystemExit("error: not a git repository")
cwd = top
max_b = int(os.environ.get("MAX_BYTES", str(10 * 1024 * 1024)))
verbose = os.environ.get("VERBOSE", "").lower() in ("1", "true", "yes", "y")
quiet = os.environ.get("QUIET", "0").lower() in ("1", "true", "yes", "y")
blobs = list_blobs(cwd)
_keep, omitted = apply_rules(cwd, blobs, max_b, verbose=verbose)
paths = sorted(_keep)
n_om = sum(omitted.values())
if not quiet and n_om:
print(
"HF export: omitted "
f"{n_om} path(s) "
f"(binary={omitted['binary']}, file>{max_b}B={omitted['file']}, "
f"subtree>{max_b}B={omitted['subtree']})",
file=sys.stderr,
)
oid = make_export_commit(cwd, paths)
print(oid, end="")
if __name__ == "__main__":
try:
main()
except subprocess.CalledProcessError as e: # pragma: no cover
print("error: git command failed", file=sys.stderr)
raise SystemExit(1) from e