# watermark_remover/scripts/mirror_checkpoints.py
# Author: Jack Wu
# feat: introduce checkpoint mirroring script, strengthen video validation,
#       and improve pipeline robustness for masking and compositing.
# Commit: ebe8a5c
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "huggingface_hub>=0.26",
# "requests>=2.31",
# ]
# ///
"""
mirror_checkpoints.py
---------------------
One-off mirror job: copies the three model dependencies for the
Video Watermark Remover Space into JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints
so the Space is insulated from upstream deletion.
Sources mirrored:
1. Wan-AI/Wan2.1-VACE-14B-diffusers (~75 GB, Apache-2.0) β†’ vace-14b/
2. lightx2v/Wan2.1-Distill-Loras (single LoRA file) β†’ loras/
3. big-lama.pt from GitHub releases (~196 MB, Apache-2.0) β†’ lama/
Strategy
--------
Per-file streaming: download β†’ upload β†’ delete. Disk usage at any moment
is ~one file (max ~5 GB for a single VACE transformer shard), so this fits
on cpu-basic / cpu-upgrade Jobs without ever holding the full 75 GB locally.
"""
import os
import sys
from pathlib import Path
import requests
from huggingface_hub import HfApi, hf_hub_download, list_repo_files
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
DEST_REPO = "JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints"
TOKEN = os.environ.get("HF_TOKEN")
if not TOKEN:
sys.exit("HF_TOKEN env var not set; pass via `--secrets HF_TOKEN=...`")
WORK = Path("/tmp/mirror")
WORK.mkdir(parents=True, exist_ok=True)
api = HfApi(token=TOKEN)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def stream_repo(
src_repo: str,
dest_prefix: str,
src_type: str = "model",
exclude_globs: list[str] | None = None,
) -> None:
"""Mirror every file in src_repo under dest_prefix in DEST_REPO."""
files = list_repo_files(src_repo, repo_type=src_type, token=TOKEN)
exclude = exclude_globs or []
files = [f for f in files if not any(Path(f).match(g) for g in exclude)]
print(f"\n=== {src_repo} β†’ {dest_prefix}/ ({len(files)} files) ===", flush=True)
for i, fname in enumerate(files, 1):
print(f" [{i:>3}/{len(files)}] {fname}", flush=True)
local = hf_hub_download(
repo_id=src_repo,
repo_type=src_type,
filename=fname,
local_dir=str(WORK),
token=TOKEN,
)
api.upload_file(
path_or_fileobj=local,
path_in_repo=f"{dest_prefix}/{fname}",
repo_id=DEST_REPO,
repo_type="model",
commit_message=f"Mirror {src_repo}: {fname}",
)
Path(local).unlink(missing_ok=True)
def stream_url(url: str, dest_path_in_repo: str, commit_message: str) -> None:
"""Download a single file from an arbitrary URL, push to DEST_REPO, delete."""
fname = Path(dest_path_in_repo).name
print(f"\n=== {url} β†’ {dest_path_in_repo} ===", flush=True)
local = WORK / fname
with requests.get(url, stream=True, timeout=300) as r:
r.raise_for_status()
with open(local, "wb") as fp:
for chunk in r.iter_content(chunk_size=1 << 20): # 1 MB chunks
fp.write(chunk)
api.upload_file(
path_or_fileobj=str(local),
path_in_repo=dest_path_in_repo,
repo_id=DEST_REPO,
repo_type="model",
commit_message=commit_message,
)
local.unlink(missing_ok=True)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
# 1. VACE-14B (largest β€” do first while disk is freshest)
stream_repo(
"Wan-AI/Wan2.1-VACE-14B-diffusers",
dest_prefix="vace-14b",
exclude_globs=["assets/*", ".gitattributes"],
)
# 2. 4-step distill LoRA (single file)
stream_repo(
"lightx2v/Wan2.1-Distill-Loras",
dest_prefix="loras",
exclude_globs=[
"*.md",
".gitattributes",
# Pull only the rank-64 t2v 4-step LoRA β€” matches vace.py 8-step plan
"*i2v*",
"*rank32*",
"*rank128*",
],
)
# 3. LaMa from GitHub release
stream_url(
url="https://github.com/enesmsahin/simple-lama-inpainting/releases/download/v0.1.0/big-lama.pt",
dest_path_in_repo="lama/big-lama.pt",
commit_message="Mirror big-lama.pt from simple-lama-inpainting v0.1.0 GitHub release",
)
print("\nβœ… All mirrors complete.")
if __name__ == "__main__":
main()