# watermark_remover/scripts/mirror_checkpoints.py
# Author: Jack Wu
# feat: introduce checkpoint mirroring script, strengthen video validation,
#       and improve pipeline robustness for masking and compositing.
# Commit: ebe8a5c
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "huggingface_hub>=0.26",
# "requests>=2.31",
# ]
# ///
"""
mirror_checkpoints.py
---------------------
One-off mirror job: copies the three model dependencies for the
Video Watermark Remover Space into JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints
so the Space is insulated from upstream deletion.
Sources mirrored:
1. Wan-AI/Wan2.1-VACE-14B-diffusers (~75 GB, Apache-2.0) β†’ vace-14b/
2. lightx2v/Wan2.1-Distill-Loras (single LoRA file) β†’ loras/
3. big-lama.pt from GitHub releases (~196 MB, Apache-2.0) β†’ lama/
Strategy
--------
Per-file streaming: download β†’ upload β†’ delete. Disk usage at any moment
is ~one file (max ~5 GB for a single VACE transformer shard), so this fits
on cpu-basic / cpu-upgrade Jobs without ever holding the full 75 GB locally.
"""
import os
import sys
from pathlib import Path
import requests
from huggingface_hub import HfApi, hf_hub_download, list_repo_files
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
DEST_REPO = "JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints"
TOKEN = os.environ.get("HF_TOKEN")
if not TOKEN:
sys.exit("HF_TOKEN env var not set; pass via `--secrets HF_TOKEN=...`")
WORK = Path("/tmp/mirror")
WORK.mkdir(parents=True, exist_ok=True)
api = HfApi(token=TOKEN)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def stream_repo(
src_repo: str,
dest_prefix: str,
src_type: str = "model",
exclude_globs: list[str] | None = None,
) -> None:
"""Mirror every file in src_repo under dest_prefix in DEST_REPO."""
files = list_repo_files(src_repo, repo_type=src_type, token=TOKEN)
exclude = exclude_globs or []
files = [f for f in files if not any(Path(f).match(g) for g in exclude)]
print(f"\n=== {src_repo} β†’ {dest_prefix}/ ({len(files)} files) ===", flush=True)
for i, fname in enumerate(files, 1):
print(f" [{i:>3}/{len(files)}] {fname}", flush=True)
local = hf_hub_download(
repo_id=src_repo,
repo_type=src_type,
filename=fname,
local_dir=str(WORK),
token=TOKEN,
)
api.upload_file(
path_or_fileobj=local,
path_in_repo=f"{dest_prefix}/{fname}",
repo_id=DEST_REPO,
repo_type="model",
commit_message=f"Mirror {src_repo}: {fname}",
)
Path(local).unlink(missing_ok=True)
def stream_url(url: str, dest_path_in_repo: str, commit_message: str) -> None:
"""Download a single file from an arbitrary URL, push to DEST_REPO, delete."""
fname = Path(dest_path_in_repo).name
print(f"\n=== {url} β†’ {dest_path_in_repo} ===", flush=True)
local = WORK / fname
with requests.get(url, stream=True, timeout=300) as r:
r.raise_for_status()
with open(local, "wb") as fp:
for chunk in r.iter_content(chunk_size=1 << 20): # 1 MB chunks
fp.write(chunk)
api.upload_file(
path_or_fileobj=str(local),
path_in_repo=dest_path_in_repo,
repo_id=DEST_REPO,
repo_type="model",
commit_message=commit_message,
)
local.unlink(missing_ok=True)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
# 1. VACE-14B (largest β€” do first while disk is freshest)
stream_repo(
"Wan-AI/Wan2.1-VACE-14B-diffusers",
dest_prefix="vace-14b",
exclude_globs=["assets/*", ".gitattributes"],
)
# 2. 4-step distill LoRA (single file)
stream_repo(
"lightx2v/Wan2.1-Distill-Loras",
dest_prefix="loras",
exclude_globs=[
"*.md",
".gitattributes",
# Pull only the rank-64 t2v 4-step LoRA β€” matches vace.py 8-step plan
"*i2v*",
"*rank32*",
"*rank128*",
],
)
# 3. LaMa from GitHub release
stream_url(
url="https://github.com/enesmsahin/simple-lama-inpainting/releases/download/v0.1.0/big-lama.pt",
dest_path_in_repo="lama/big-lama.pt",
commit_message="Mirror big-lama.pt from simple-lama-inpainting v0.1.0 GitHub release",
)
print("\nβœ… All mirrors complete.")
if __name__ == "__main__":
main()