Use bundled code + base_model paths (v2)
inference_example.py  CHANGED  (+39 −36)
@@ -1,17 +1,9 @@
 """
-ViTeX-14B inference example.
+ViTeX-14B inference example (self-contained).
 
-
-
-
-
-Runs one or more video text-edit jobs, writing MP4 outputs.
-
-Requires:
-  - The DiffSynth-Studio-TextVACE fork (provides GlyphEncoder + ConditionCrossAttention)
-  - torch >= 2.7.0+cu128 (NCCL >= 2.25.1 recommended on H100)
-  - One NVIDIA GPU with >= 80 GB VRAM (H100 / A100 80 GB)
-  - imageio-ffmpeg, opencv-python
+Assumes you cloned this HuggingFace repo and are running this script from the
+repo root. The bundled `diffsynth/` library, `vitex_14b.safetensors` weights,
+and the full `base_model/` directory are picked up automatically.
 
 Usage:
     python inference_example.py \
@@ -20,21 +12,32 @@ Usage:
         --glyph_video path/to/target_glyph.mp4 \
         --prompt "Change the sign to read 'HILTON'" \
         --output out.mp4
+
+Hardware:
+  - 1 × NVIDIA GPU with >= 80 GB VRAM (peak ~70 GB at 720 × 1280 × 121 frames)
+  - ~250 GB CPU RAM recommended (DiT loading + activation offload)
 """
 
 import os
+import sys
 import argparse
 import glob
 
+# Use the bundled diffsynth shipped alongside this script.
+HERE = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, HERE)
+
 import torch
 from PIL import Image
 
-from huggingface_hub import snapshot_download
-
 from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 from diffsynth.core import load_state_dict
 
 
+BASE_DIR = os.path.join(HERE, "base_model")
+ADAPTER_CKPT = os.path.join(HERE, "vitex_14b.safetensors")
+TOKENIZER_DIR = os.path.join(BASE_DIR, "google", "umt5-xxl")
+
 HEIGHT = 720
 WIDTH = 1280
 NUM_FRAMES = 121
@@ -44,7 +47,8 @@ SEED = 42
 
 
 def load_video_frames(path, target_frames=NUM_FRAMES, resize=(HEIGHT, WIDTH)):
-    """Load a video file into a list of PIL Images,
+    """Load a video file into a list of PIL Images, sub-sampled or padded to
+    `target_frames`, optionally resized to `(H, W)`."""
     import cv2
     cap = cv2.VideoCapture(path)
     frames = []
@@ -60,7 +64,6 @@ def load_video_frames(path, target_frames=NUM_FRAMES, resize=(HEIGHT, WIDTH)):
 
     if not frames:
         raise ValueError(f"empty video: {path}")
-
     if target_frames and len(frames) > target_frames:
         import numpy as np
         idx = np.linspace(0, len(frames) - 1, target_frames, dtype=int)
@@ -93,21 +96,31 @@ def save_video(frames, path, fps=24):
     proc.wait()
 
 
-def build_pipeline(base_dir, ckpt_path, device="cuda:0"):
-    diffusion_shards = sorted(glob.glob(os.path.join(
+def build_pipeline(device="cuda:0"):
+    diffusion_shards = sorted(glob.glob(os.path.join(BASE_DIR, "diffusion_pytorch_model-*.safetensors")))
+    if not diffusion_shards:
+        raise FileNotFoundError(
+            f"No diffusion_pytorch_model-*.safetensors found under {BASE_DIR}. "
+            "Make sure you downloaded the full repo via `git lfs clone` or "
+            "`huggingface-cli download ViTeX-Bench/ViTeX-14B`."
+        )
+    if not os.path.isfile(ADAPTER_CKPT):
+        raise FileNotFoundError(f"Missing trained adapter: {ADAPTER_CKPT}")
+
     pipe = WanVideoPipeline.from_pretrained(
         torch_dtype=torch.bfloat16,
         device=device,
         model_configs=[
             ModelConfig(path=diffusion_shards),
-            ModelConfig(path=os.path.join(
-            ModelConfig(path=os.path.join(
+            ModelConfig(path=os.path.join(BASE_DIR, "models_t5_umt5-xxl-enc-bf16.pth")),
+            ModelConfig(path=os.path.join(BASE_DIR, "Wan2.1_VAE.pth")),
         ],
-        tokenizer_config=ModelConfig(path=
+        tokenizer_config=ModelConfig(path=TOKENIZER_DIR),
         redirect_common_files=False,
     )
-
-
+
+    print(f"Loading ViTeX-14B trained weights from {ADAPTER_CKPT}")
+    state = load_state_dict(ADAPTER_CKPT)
     res = pipe.vace.load_state_dict(state, strict=False)
     print(f"  loaded {len(state)} keys (missing {len(res.missing_keys)}, unexpected {len(res.unexpected_keys)})")
     del state
@@ -116,9 +129,9 @@ def build_pipeline(base_dir, ckpt_path, device="cuda:0"):
 
 def main():
     p = argparse.ArgumentParser()
-    p.add_argument("--vace_video", required=True, help="Source RGB video
+    p.add_argument("--vace_video", required=True, help="Source RGB video to edit.")
     p.add_argument("--vace_mask", required=True, help="Per-frame binary mask: 1=replace, 0=keep.")
-    p.add_argument("--glyph_video", required=True, help="Pre-rendered target glyphs
+    p.add_argument("--glyph_video", required=True, help="Pre-rendered target glyphs in the mask region.")
     p.add_argument("--prompt", default="", help="Optional text prompt describing the edit.")
     p.add_argument("--output", default="output.mp4")
     p.add_argument("--height", type=int, default=HEIGHT)
@@ -130,23 +143,13 @@ def main():
     p.add_argument("--device", default="cuda:0")
     args = p.parse_args()
 
-
-    print("Downloading Wan-AI/Wan2.1-VACE-14B (base, ~60 GB)...")
-    base_dir = snapshot_download("Wan-AI/Wan2.1-VACE-14B")
-    print("Downloading ViTeX-Bench/ViTeX-14B (this model, ~8 GB)...")
-    vitex_dir = snapshot_download("ViTeX-Bench/ViTeX-14B")
-    ckpt_path = os.path.join(vitex_dir, "vitex_14b.safetensors")
-
-    # 2. Build pipeline
-    pipe = build_pipeline(base_dir, ckpt_path, device=args.device)
+    pipe = build_pipeline(device=args.device)
 
-    # 3. Load inputs
     target_size = (args.height, args.width)
     vace_video = load_video_frames(args.vace_video, args.num_frames, target_size)
    vace_mask = load_video_frames(args.vace_mask, args.num_frames, target_size)
     glyph = load_video_frames(args.glyph_video, args.num_frames, target_size)
 
-    # 4. Run
     print(f"Running pipeline (seed={args.seed}, cfg={args.cfg_scale}, steps={args.num_inference_steps})...")
     out_frames = pipe(
         prompt=args.prompt,