Jayce-Ping committed on
Commit 7cdb0ca · verified · 1 Parent(s): 6f7040a

Add files using upload-large-folder tool

Files changed (29)
  1. sequence/data_generation.py +336 -0
  2. sequence/test.py +376 -0
  3. sudoku/convert.py +81 -0
  4. sudoku/convert_wan.py +1287 -0
  5. sudoku/generate_dataset.py +424 -0
  6. sudoku/jsonl_to_csv.py +22 -0
  7. sudoku/simplify_dataset.py +19 -0
  8. sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0-diffusers/diffusion_pytorch_model-00001-of-00007.safetensors +3 -0
  9. sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0-diffusers/diffusion_pytorch_model-00002-of-00007.safetensors +3 -0
  10. sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0-diffusers/diffusion_pytorch_model-00003-of-00007.safetensors +3 -0
  11. sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0-diffusers/diffusion_pytorch_model-00004-of-00007.safetensors +3 -0
  12. sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0-diffusers/diffusion_pytorch_model-00005-of-00007.safetensors +3 -0
  13. sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0-diffusers/diffusion_pytorch_model-00006-of-00007.safetensors +3 -0
  14. sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0-diffusers/diffusion_pytorch_model-00007-of-00007.safetensors +3 -0
  15. sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0.safetensors +3 -0
  16. sudoku/sudoku/checkpoints/Wan2.2-TI2V-5B_full/epoch-0.safetensors +3 -0
  17. sudoku/sudoku/checkpoints/Wan2.2-TI2V-5B_full/epoch-1.safetensors +3 -0
  18. sudoku/sudoku/checkpoints/Wan2.2-TI2V-5B_full/epoch-2.safetensors +3 -0
  19. sudoku/sudoku/checkpoints/Wan2.2-TI2V-5B_full/epoch-3.safetensors +3 -0
  20. sudoku/sudoku/checkpoints/Wan2.2-TI2V-5B_full/epoch-4.safetensors +3 -0
  21. sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31/epoch-3.safetensors +3 -0
  22. sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31_diffusers/diffusion_pytorch_model-00001-of-00007.safetensors +3 -0
  23. sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31_diffusers/diffusion_pytorch_model-00002-of-00007.safetensors +3 -0
  24. sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31_diffusers/diffusion_pytorch_model-00003-of-00007.safetensors +3 -0
  25. sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31_diffusers/diffusion_pytorch_model-00004-of-00007.safetensors +3 -0
  26. sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31_diffusers/diffusion_pytorch_model-00005-of-00007.safetensors +3 -0
  27. sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31_diffusers/diffusion_pytorch_model-00006-of-00007.safetensors +3 -0
  28. sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31_diffusers/diffusion_pytorch_model-00007-of-00007.safetensors +3 -0
  29. sudoku/sudoku_processor.py +479 -0
sequence/data_generation.py ADDED
@@ -0,0 +1,336 @@
1
+ """
2
+ Sequence Prediction Dataset Generator.
3
+
4
+ Generates image pairs for sequence prediction tasks with various
5
+ mathematical sequences (arithmetic, geometric, fibonacci, etc.)
6
+ """
7
+
8
+ import json
9
+ import random
10
+ from pathlib import Path
11
+ from typing import Callable
12
+
13
+ import matplotlib.pyplot as plt
14
+ import matplotlib.patches as patches
15
+
16
+
17
+ # ============== Sequence Generators ==============
18
+
19
+ def arithmetic_seq(start: int, diff: int, length: int = 4) -> list[int]:
20
+ """Arithmetic sequence: a, a+d, a+2d, ..."""
21
+ return [start + i * diff for i in range(length)]
22
+
23
+
24
+ def geometric_seq(start: int, ratio: int, length: int = 4) -> list[int]:
25
+ """Geometric sequence: a, a*r, a*r^2, ..."""
26
+ return [start * (ratio ** i) for i in range(length)]
27
+
28
+
29
+ def square_seq(start: int, length: int = 4) -> list[int]:
30
+ """Square numbers: n^2, (n+1)^2, ..."""
31
+ return [(start + i) ** 2 for i in range(length)]
32
+
33
+
34
+ def cube_seq(start: int, length: int = 4) -> list[int]:
35
+ """Cube numbers: n^3, (n+1)^3, ..."""
36
+ return [(start + i) ** 3 for i in range(length)]
37
+
38
+
39
+ def triangular_seq(start: int, length: int = 4) -> list[int]:
40
+ """Triangular numbers: n(n+1)/2"""
41
+ return [(start + i) * (start + i + 1) // 2 for i in range(length)]
42
+
43
+
44
+ def fibonacci_like_seq(a: int, b: int, length: int = 4) -> list[int]:
45
+ """Fibonacci-like: a, b, a+b, a+2b, ..."""
46
+ seq = [a, b]
47
+ for _ in range(length - 2):
48
+ seq.append(seq[-1] + seq[-2])
49
+ return seq[:length]
50
+
51
+
52
+ def prime_seq(start_idx: int, length: int = 4) -> list[int]:
53
+ """Prime numbers starting from index."""
54
+ primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47]
55
+ return primes[start_idx:start_idx + length]
56
+
57
+
58
+ def power_of_two_seq(start: int, length: int = 4) -> list[int]:
59
+ """Powers of 2: 2^n, 2^(n+1), ..."""
60
+ return [2 ** (start + i) for i in range(length)]
61
+
62
+
63
+ def factorial_seq(start: int, length: int = 4) -> list[int]:
64
+ """Factorial sequence: n!, (n+1)!, ..."""
65
+ from math import factorial
66
+ return [factorial(start + i) for i in range(length)]
67
+
68
+
69
+ # ============== Sequence Factory ==============
70
+
71
+ SEQUENCE_TYPES = {
72
+ "arithmetic": lambda rng: arithmetic_seq(
73
+ rng.randint(1, 20), rng.randint(1, 10)
74
+ ),
75
+ "arithmetic_neg": lambda rng: arithmetic_seq(
76
+ rng.randint(20, 50), -rng.randint(1, 5)
77
+ ),
78
+ "geometric_2": lambda rng: geometric_seq(
79
+ rng.randint(1, 5), 2
80
+ ),
81
+ "geometric_3": lambda rng: geometric_seq(
82
+ rng.randint(1, 3), 3
83
+ ),
84
+ "square": lambda rng: square_seq(rng.randint(1, 10)),
85
+ "cube": lambda rng: cube_seq(rng.randint(1, 5)),
86
+ "triangular": lambda rng: triangular_seq(rng.randint(1, 10)),
87
+ "fibonacci": lambda rng: fibonacci_like_seq(
88
+ rng.randint(1, 5), rng.randint(1, 5)
89
+ ),
90
+ "prime": lambda rng: prime_seq(rng.randint(0, 10)),
91
+ "power_of_2": lambda rng: power_of_two_seq(rng.randint(0, 6)),
92
+ }
93
+
94
+
95
+ def generate_sequence_pair(seq: list[int]) -> tuple[list, list]:
96
+ """
97
+ Generate a pair of sequences for the task.
98
+
99
+ Returns:
100
+ (partial, complete): partial has last element as "", complete is full.
101
+ """
102
+ partial = seq[:-1] + [""]
103
+ return partial, seq
104
+
105
+
106
+ # ============== Image Generation ==============
107
+
108
+ def round_to_multiple(x: int, multiple: int = 16) -> int:
109
+ """Round x up to nearest multiple."""
110
+ return ((x + multiple - 1) // multiple) * multiple
111
+
112
+
113
+ def create_number_grid(
114
+ numbers: list,
115
+ save_path: str,
116
+ height: int = 224,
117
+ width: int = 896,
118
+ fontsize: int = 48,
119
+ size_multiple: int = 16,
120
+ ) -> None:
121
+ """
122
+ Create a 1xN grid image with numbers in each cell.
123
+
124
+ Args:
125
+ numbers: List of numbers/strings to display.
126
+ save_path: Output file path.
127
+ height: Target height in pixels (will be rounded to size_multiple).
128
+ width: Target width in pixels (will be rounded to size_multiple).
129
+ fontsize: Font size for the numbers.
130
+ size_multiple: Ensure dimensions are multiples of this (default 16).
131
+ """
132
+ from PIL import Image
133
+
134
+ n = len(numbers)
135
+
136
+ # Ensure dimensions are multiples of size_multiple
137
+ width = round_to_multiple(width, size_multiple)
138
+ height = round_to_multiple(height, size_multiple)
139
+
140
+ # Use fixed DPI and calculate figsize
141
+ dpi = 100
142
+ fig_width = width / dpi
143
+ fig_height = height / dpi
144
+
145
+ fig, ax = plt.subplots(figsize=(fig_width, fig_height), dpi=dpi)
146
+ fig.subplots_adjust(left=0, right=1, top=1, bottom=0)
147
+
148
+ for i, num in enumerate(numbers):
149
+ rect = patches.Rectangle(
150
+ (i, 0), 1, 1, linewidth=2,
151
+ edgecolor='black', facecolor='white'
152
+ )
153
+ ax.add_patch(rect)
154
+ ax.text(
155
+ i + 0.5, 0.5, str(num), fontsize=fontsize,
156
+ ha='center', va='center', fontweight='bold'
157
+ )
158
+
159
+ ax.set_xlim(0, n)
160
+ ax.set_ylim(0, 1)
161
+ ax.set_aspect('equal')
162
+ ax.axis('off')
163
+
164
+ # Save with exact pixel dimensions
165
+ fig.savefig(save_path, dpi=dpi, facecolor='white', edgecolor='none')
166
+ plt.close(fig)
167
+
168
+ # Final resize to ensure exact dimensions (16 multiples)
169
+ img = Image.open(save_path)
170
+ if img.size != (width, height):
171
+ img = img.resize((width, height), Image.Resampling.LANCZOS)
172
+ img.save(save_path)
173
+
174
+
175
+ # ============== Dataset Generation ==============
176
+
177
+ class SequenceDatasetGenerator:
178
+ """Generate sequence prediction dataset with train/test splits."""
179
+
180
+ def __init__(
181
+ self,
182
+ output_dir: str,
183
+ seed: int = 42,
184
+ num_pairs: tuple[int, int] = (2, 3),
185
+ seq_types: list[str] | None = None,
186
+ image_height: int = 224,
187
+ image_width: int = 896,
188
+ fontsize: int = 48,
189
+ ):
190
+ """
191
+ Args:
192
+ output_dir: Directory to save the dataset.
193
+ seed: Random seed for reproducibility.
194
+ num_pairs: Range of pairs per sample (min, max inclusive).
195
+ seq_types: List of sequence types to use (None = all).
196
+ image_height: Image height in pixels (rounded to 16).
197
+ image_width: Image width in pixels (rounded to 16).
198
+ fontsize: Font size for numbers.
199
+ """
200
+ self.output_dir = Path(output_dir)
201
+ self.rng = random.Random(seed)
202
+ self.num_pairs = num_pairs
203
+ self.seq_types = seq_types or list(SEQUENCE_TYPES.keys())
204
+ self.image_height = round_to_multiple(image_height, 16)
205
+ self.image_width = round_to_multiple(image_width, 16)
206
+ self.fontsize = fontsize
207
+
208
+ # Create directories
209
+ for split in ["train", "test"]:
210
+ (self.output_dir / split / "images").mkdir(parents=True, exist_ok=True)
211
+
212
+ def _generate_sample(self, sample_id: int) -> dict:
213
+ """Generate a single sample with multiple sequence pairs."""
214
+ num_pairs = self.rng.randint(*self.num_pairs)
215
+ seq_type = self.rng.choice(self.seq_types)
216
+
217
+ # Generate base sequence and subsequent ones
218
+ base_seq = SEQUENCE_TYPES[seq_type](self.rng)
219
+
220
+ pairs = []
221
+ for i in range(num_pairs):
222
+ # Shift sequence for each pair
223
+ if seq_type.startswith("arithmetic"):
224
+ diff = base_seq[1] - base_seq[0]
225
+ seq = [x + i * diff for x in base_seq]
226
+ elif seq_type.startswith("geometric"):
227
+ ratio = base_seq[1] // base_seq[0] if base_seq[0] != 0 else 2
228
+ seq = [x * (ratio ** i) for x in base_seq]
229
+ else:
230
+ # For other types, regenerate with offset
231
+ seq = [x + i for x in base_seq]
232
+
233
+ partial, complete = generate_sequence_pair(seq)
234
+ pairs.append({
235
+ "partial": partial,
236
+ "complete": complete,
237
+ "answer": complete[-1],
238
+ })
239
+
240
+ return {
241
+ "id": sample_id,
242
+ "seq_type": seq_type,
243
+ "num_pairs": num_pairs,
244
+ "pairs": pairs,
245
+ }
246
+
247
+ def _save_sample_images(
248
+ self, sample: dict, split: str, include_last_answer: bool = True
249
+ ) -> dict:
250
+ """Save images for a sample and return metadata."""
251
+ sample_id = sample["id"]
252
+ image_dir = self.output_dir / split / "images"
253
+
254
+ images = []
255
+ img_idx = 0
256
+
257
+ for i, pair in enumerate(sample["pairs"]):
258
+ # Always save partial (query) image
259
+ partial_path = f"{sample_id:05d}_{img_idx}.png"
260
+ create_number_grid(
261
+ pair["partial"], image_dir / partial_path,
262
+ height=self.image_height, width=self.image_width,
263
+ fontsize=self.fontsize,
264
+ )
265
+ images.append(partial_path)
266
+ img_idx += 1
267
+
268
+ # Save complete image based on split logic
269
+ is_last = (i == sample["num_pairs"] - 1)
270
+ if include_last_answer or not is_last:
271
+ complete_path = f"{sample_id:05d}_{img_idx}.png"
272
+ create_number_grid(
273
+ pair["complete"], image_dir / complete_path,
274
+ height=self.image_height, width=self.image_width,
275
+ fontsize=self.fontsize,
276
+ )
277
+ images.append(complete_path)
278
+ img_idx += 1
279
+
280
+ return {
281
+ "id": sample_id,
282
+ "seq_type": sample["seq_type"],
283
+ "num_pairs": sample["num_pairs"],
284
+ "images": images,
285
+ "answer": sample["pairs"][-1]["answer"], # Last image's answer
286
+ "sequences": [p["complete"] for p in sample["pairs"]],
287
+ }
288
+
289
+ def generate(self, num_train: int, num_test: int) -> None:
290
+ """
291
+ Generate the full dataset.
292
+
293
+ Args:
294
+ num_train: Number of training samples.
295
+ num_test: Number of test samples.
296
+ """
297
+ train_meta, test_meta = [], []
298
+
299
+ # Generate training samples (all pairs complete)
300
+ print(f"Generating {num_train} training samples...")
301
+ for i in range(num_train):
302
+ sample = self._generate_sample(i)
303
+ meta = self._save_sample_images(sample, "train", include_last_answer=True)
304
+ train_meta.append(meta)
305
+ if (i + 1) % 50 == 0:
306
+ print(f" Train: {i + 1}/{num_train}")
307
+
308
+ # Generate test samples (last answer hidden)
309
+ print(f"Generating {num_test} test samples...")
310
+ for i in range(num_test):
311
+ sample = self._generate_sample(num_train + i)
312
+ meta = self._save_sample_images(sample, "test", include_last_answer=False)
313
+ test_meta.append(meta)
314
+ if (i + 1) % 50 == 0:
315
+ print(f" Test: {i + 1}/{num_test}")
316
+
317
+ # Save metadata
318
+ with open(self.output_dir / "train.json", "w") as f:
319
+ json.dump(train_meta, f, indent=2)
320
+ with open(self.output_dir / "test.json", "w") as f:
321
+ json.dump(test_meta, f, indent=2)
322
+
323
+ print(f"\nDataset saved to {self.output_dir}")
324
+ print(f" Train: {num_train} samples")
325
+ print(f" Test: {num_test} samples")
326
+ print(f" Image size: {self.image_width}x{self.image_height}")
327
+ print(f" Sequence types: {self.seq_types}")
328
+
329
+
330
+ if __name__ == "__main__":
331
+ generator = SequenceDatasetGenerator(
332
+ output_dir="/home/claude/sequence_dataset",
333
+ seed=42,
334
+ num_pairs=(2, 3),
335
+ )
336
+ generator.generate(num_train=100, num_test=20)
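For reference, a minimal sketch of how the generated metadata could be consumed downstream. Field names follow _save_sample_images above; the dataset root path is an assumption (the default dataset_dir used by the evaluation script below), not something fixed by this commit.

import json
from pathlib import Path

from PIL import Image

dataset_root = Path("sequence_dataset")  # assumed output_dir used by the generator

# train.json is the list of per-sample dicts written by SequenceDatasetGenerator.generate()
with open(dataset_root / "train.json") as f:
    samples = json.load(f)

sample = samples[0]
image_dir = dataset_root / "train" / "images"

# Each sample lists its grid images in order (partial, complete, partial, complete, ...)
images = [Image.open(image_dir / name) for name in sample["images"]]
print(sample["seq_type"], sample["answer"], [img.size for img in images])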
sequence/test.py ADDED
@@ -0,0 +1,376 @@
1
+ """
2
+ Sequence Prediction Evaluation with QwenImageEditPlusPipeline / Flux2KleinPipeline.
3
+
4
+ Evaluates the model's ability to predict the next number in a sequence
5
+ by generating images and extracting answers via OCR.
6
+ """
7
+
8
+ import json
9
+ import re
10
+ from pathlib import Path
11
+ from dataclasses import dataclass, field
12
+ from enum import Enum
13
+
14
+ import numpy as np
15
+ import torch
16
+ from PIL import Image
17
+ from tqdm import tqdm
18
+
19
+
20
+ class ModelType(str, Enum):
21
+ QWEN_IMAGE_EDIT = "qwen"
22
+ FLUX2_KLEIN = "flux2-klein"
23
+
24
+
25
+ @dataclass
26
+ class EvalConfig:
27
+ """Evaluation configuration."""
28
+ dataset_dir: str = "sequence_dataset"
29
+ output_dir: str = "eval_results"
30
+
31
+ # Model selection
32
+ model_type: ModelType = ModelType.QWEN_IMAGE_EDIT
33
+ model_id: str = "" # Auto-set based on model_type if empty
34
+
35
+ # Prompts
36
+ prompt: str = (
37
+ "Based on the number patterns shown in the previous images, "
38
+ "fill in the missing number in the empty cell of the last image."
39
+ )
40
+ negative_prompt: str = ""
41
+
42
+ # Generation params
43
+ num_inference_steps: int = 5
44
+ guidance_scale: float = 1.0
45
+ true_cfg_scale: float = 4.0 # For Qwen
46
+ height: int = 210
47
+ width: int = 750
48
+
49
+ seed: int = 42
50
+ device: str = "cuda"
51
+ dtype: torch.dtype = field(default_factory=lambda: torch.bfloat16)
52
+
53
+ def __post_init__(self):
54
+ """Set default model_id based on model_type."""
55
+ if not self.model_id:
56
+ if self.model_type == ModelType.QWEN_IMAGE_EDIT:
57
+ self.model_id = "Qwen/Qwen-Image-Edit-2509"
58
+ elif self.model_type == ModelType.FLUX2_KLEIN:
59
+ self.model_id = "black-forest-labs/FLUX.2-klein-9B"
60
+
61
+
62
+ class OCRExtractor:
63
+ """Extract numbers from grid images using OCR."""
64
+
65
+ def __init__(self, backend: str = "easyocr"):
66
+ """
67
+ Args:
68
+ backend: OCR backend ("easyocr" or "pytesseract").
69
+ """
70
+ self.backend = backend
71
+ if backend == "easyocr":
72
+ import easyocr
73
+ self.reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
74
+ elif backend == "pytesseract":
75
+ import pytesseract
76
+ self.pytesseract = pytesseract
77
+ else:
78
+ raise ValueError(f"Unknown backend: {backend}")
79
+
80
+ def extract_last_number(self, image: Image.Image) -> int | None:
81
+ """
82
+ Extract the last (rightmost) number from a grid image.
83
+
84
+ Args:
85
+ image: PIL Image of the number grid.
86
+
87
+ Returns:
88
+ Extracted number or None if extraction fails.
89
+ """
90
+ w, h = image.size
91
+ cell_crop = image.crop((w * 3 // 4, 0, w, h))
92
+ cell_array = np.array(cell_crop)
93
+
94
+ if self.backend == "easyocr":
95
+ results = self.reader.readtext(cell_array)
96
+ for _, text, conf in results:
97
+ digits = re.findall(r'-?\d+', text)
98
+ if digits:
99
+ return int(digits[0])
100
+
101
+ elif self.backend == "pytesseract":
102
+ text = self.pytesseract.image_to_string(
103
+ cell_crop, config='--psm 7 -c tessedit_char_whitelist=0123456789-'
104
+ )
105
+ digits = re.findall(r'-?\d+', text)
106
+ if digits:
107
+ return int(digits[0])
108
+
109
+ return None
110
+
111
+ def extract_all_numbers(self, image: Image.Image, num_cells: int = 4) -> list[int | None]:
112
+ """Extract all numbers from a grid image."""
113
+ w, h = image.size
114
+ cell_width = w // num_cells
115
+ numbers = []
116
+
117
+ for i in range(num_cells):
118
+ cell_crop = image.crop((i * cell_width, 0, (i + 1) * cell_width, h))
119
+ cell_array = np.array(cell_crop)
120
+
121
+ if self.backend == "easyocr":
122
+ results = self.reader.readtext(cell_array)
123
+ num = None
124
+ for _, text, conf in results:
125
+ digits = re.findall(r'-?\d+', text)
126
+ if digits:
127
+ num = int(digits[0])
128
+ break
129
+ numbers.append(num)
130
+
131
+ elif self.backend == "pytesseract":
132
+ text = self.pytesseract.image_to_string(
133
+ cell_crop, config='--psm 7 -c tessedit_char_whitelist=0123456789-'
134
+ )
135
+ digits = re.findall(r'-?\d+', text)
136
+ numbers.append(int(digits[0]) if digits else None)
137
+
138
+ return numbers
139
+
140
+
141
+ class SequenceEvaluator:
142
+ """Evaluator for sequence prediction task."""
143
+
144
+ def __init__(self, config: EvalConfig):
145
+ self.config = config
146
+ self.output_dir = Path(config.output_dir)
147
+ self.output_dir.mkdir(parents=True, exist_ok=True)
148
+
149
+ # Load pipeline based on model type
150
+ self.pipeline = self._load_pipeline()
151
+
152
+ # Initialize OCR
153
+ self.ocr = OCRExtractor(backend="easyocr")
154
+
155
+ def _load_pipeline(self):
156
+ """Load pipeline based on model type."""
157
+ if self.config.model_type == ModelType.QWEN_IMAGE_EDIT:
158
+ return self._load_qwen_pipeline()
159
+ elif self.config.model_type == ModelType.FLUX2_KLEIN:
160
+ return self._load_flux2_klein_pipeline()
161
+ else:
162
+ raise ValueError(f"Unknown model type: {self.config.model_type}")
163
+
164
+ def _load_qwen_pipeline(self):
165
+ """Load QwenImageEditPlusPipeline."""
166
+ from diffusers import QwenImageEditPlusPipeline
167
+
168
+ pipeline = QwenImageEditPlusPipeline.from_pretrained(
169
+ self.config.model_id,
170
+ torch_dtype=self.config.dtype,
171
+ )
172
+ pipeline.to(self.config.device)
173
+ pipeline.set_progress_bar_config(disable=True)
174
+ return pipeline
175
+
176
+ def _load_flux2_klein_pipeline(self):
177
+ """Load Flux2KleinPipeline."""
178
+ from diffusers import Flux2KleinPipeline
179
+
180
+ pipeline = Flux2KleinPipeline.from_pretrained(
181
+ self.config.model_id,
182
+ torch_dtype=self.config.dtype,
183
+ )
184
+ pipeline.enable_model_cpu_offload()
185
+ pipeline.set_progress_bar_config(disable=True)
186
+ return pipeline
187
+
188
+ def _load_images(self, image_paths: list[str], image_dir: Path) -> list[Image.Image]:
189
+ """Load images from paths."""
190
+ return [Image.open(image_dir / p).convert("RGB") for p in image_paths]
191
+
192
+ def predict(self, images: list[Image.Image]) -> Image.Image:
193
+ """
194
+ Generate prediction image given input images.
195
+
196
+ Args:
197
+ images: List of input images (context + query).
198
+
199
+ Returns:
200
+ Generated image with predicted number.
201
+ """
202
+ generator = torch.Generator(device=self.config.device).manual_seed(self.config.seed)
203
+
204
+ if self.config.model_type == ModelType.QWEN_IMAGE_EDIT:
205
+ inputs = {
206
+ "image": images,
207
+ "prompt": self.config.prompt,
208
+ "generator": generator,
209
+ "true_cfg_scale": self.config.true_cfg_scale,
210
+ "negative_prompt": self.config.negative_prompt,
211
+ "num_inference_steps": self.config.num_inference_steps,
212
+ }
213
+
214
+ elif self.config.model_type == ModelType.FLUX2_KLEIN:
215
+ # Flux2Klein uses image parameter for multi-image editing
216
+ inputs = {
217
+ "image": images,
218
+ "prompt": self.config.prompt,
219
+ "generator": generator,
220
+ "guidance_scale": self.config.guidance_scale,
221
+ "num_inference_steps": self.config.num_inference_steps,
222
+ "height": self.config.height,
223
+ "width": self.config.width,
224
+ }
225
+
226
+ with torch.inference_mode():
227
+ output = self.pipeline(**inputs)
228
+
229
+ return output.images[0]
230
+
231
+ def evaluate_sample(self, sample: dict, image_dir: Path) -> dict:
232
+ """
233
+ Evaluate a single sample.
234
+
235
+ Args:
236
+ sample: Sample metadata dict.
237
+ image_dir: Directory containing images.
238
+
239
+ Returns:
240
+ Evaluation result dict.
241
+ """
242
+ # Load input images (all available in test set)
243
+ images = self._load_images(sample["images"], image_dir)
244
+
245
+ # Generate prediction
246
+ pred_image = self.predict(images)
247
+
248
+ # Save prediction image
249
+ pred_path = self.output_dir / f"{sample['id']:05d}_pred.png"
250
+ pred_image.save(pred_path)
251
+
252
+ # Extract predicted number via OCR
253
+ pred_number = self.ocr.extract_last_number(pred_image)
254
+
255
+ # Get ground truth
256
+ gt_number = sample["answer"]
257
+
258
+ # Check correctness
259
+ correct = pred_number == gt_number
260
+
261
+ return {
262
+ "id": sample["id"],
263
+ "seq_type": sample["seq_type"],
264
+ "gt_answer": gt_number,
265
+ "pred_answer": pred_number,
266
+ "correct": correct,
267
+ "pred_image": str(pred_path),
268
+ }
269
+
270
+ def evaluate(self, split: str = "test") -> dict:
271
+ """
272
+ Evaluate on entire dataset split.
273
+
274
+ Args:
275
+ split: Dataset split ("train" or "test").
276
+
277
+ Returns:
278
+ Evaluation results summary.
279
+ """
280
+ dataset_dir = Path(self.config.dataset_dir)
281
+
282
+ # Load metadata
283
+ with open(dataset_dir / f"{split}.json") as f:
284
+ samples = json.load(f)
285
+
286
+ image_dir = dataset_dir / split / "images"
287
+
288
+ results = []
289
+ for sample in tqdm(samples, desc=f"Evaluating {split}"):
290
+ result = self.evaluate_sample(sample, image_dir)
291
+ results.append(result)
292
+
293
+ # Compute metrics
294
+ total = len(results)
295
+ correct = sum(r["correct"] for r in results)
296
+ accuracy = correct / total if total > 0 else 0.0
297
+
298
+ # Per-type accuracy
299
+ type_stats = {}
300
+ for r in results:
301
+ seq_type = r["seq_type"]
302
+ if seq_type not in type_stats:
303
+ type_stats[seq_type] = {"correct": 0, "total": 0}
304
+ type_stats[seq_type]["total"] += 1
305
+ if r["correct"]:
306
+ type_stats[seq_type]["correct"] += 1
307
+
308
+ type_accuracy = {
309
+ k: v["correct"] / v["total"] for k, v in type_stats.items()
310
+ }
311
+
312
+ summary = {
313
+ "split": split,
314
+ "model_type": self.config.model_type.value,
315
+ "model_id": self.config.model_id,
316
+ "total": total,
317
+ "correct": correct,
318
+ "accuracy": accuracy,
319
+ "type_accuracy": type_accuracy,
320
+ "results": results,
321
+ }
322
+
323
+ # Save results
324
+ with open(self.output_dir / f"{split}_results.json", "w") as f:
325
+ json.dump(summary, f, indent=2)
326
+
327
+ return summary
328
+
329
+
330
+ def main():
331
+ """Run evaluation."""
332
+ import argparse
333
+
334
+ parser = argparse.ArgumentParser(description="Sequence Prediction Evaluation")
335
+ parser.add_argument("--model", type=str, default="flux2-klein",
336
+ choices=["qwen", "flux2-klein"],
337
+ help="Model type to use")
338
+ parser.add_argument("--model-id", type=str, default="",
339
+ help="Custom model ID (optional)")
340
+ parser.add_argument("--dataset-dir", type=str, default="sequence_dataset",
341
+ help="Dataset directory")
342
+ parser.add_argument("--output-dir", type=str, default="eval_results",
343
+ help="Output directory")
344
+ parser.add_argument("--steps", type=int, default=50,
345
+ help="Number of inference steps")
346
+ parser.add_argument("--seed", type=int, default=42,
347
+ help="Random seed")
348
+ args = parser.parse_args()
349
+
350
+ config = EvalConfig(
351
+ dataset_dir=args.dataset_dir,
352
+ output_dir=args.output_dir,
353
+ model_type=ModelType(args.model),
354
+ model_id=args.model_id,
355
+ num_inference_steps=args.steps,
356
+ seed=args.seed,
357
+ )
358
+
359
+ print(f"Model: {config.model_type.value} ({config.model_id})")
360
+
361
+ evaluator = SequenceEvaluator(config)
362
+ results = evaluator.evaluate("test")
363
+
364
+ print(f"\n{'='*50}")
365
+ print(f"Evaluation Results ({config.model_type.value})")
366
+ print(f"{'='*50}")
367
+ print(f"Total samples: {results['total']}")
368
+ print(f"Correct: {results['correct']}")
369
+ print(f"Accuracy: {results['accuracy']:.2%}")
370
+ print(f"\nPer-type accuracy:")
371
+ for seq_type, acc in sorted(results["type_accuracy"].items()):
372
+ print(f" {seq_type}: {acc:.2%}")
373
+
374
+
375
+ if __name__ == "__main__":
376
+ main()
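Before launching a full evaluation run, the OCR path can be sanity-checked in isolation on a single ground-truth grid image. This is a hedged sketch: the image path is a placeholder, and the import assumes the snippet is run from the sequence/ directory so that test.py resolves locally.

from PIL import Image

from test import OCRExtractor  # OCRExtractor defined in sequence/test.py above

img = Image.open("sequence_dataset/test/images/00100_0.png").convert("RGB")  # placeholder path

ocr = OCRExtractor(backend="easyocr")
print("last cell:", ocr.extract_last_number(img))
print("all cells:", ocr.extract_all_numbers(img, num_cells=4))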
sudoku/convert.py ADDED
@@ -0,0 +1,81 @@
1
+ """
2
+ Convert a single safetensors file to the HuggingFace Diffusers format.
3
+
4
+ Usage:
5
+ python convert.py --ckpt epoch-4.safetensors --model_type Wan-T2V-14B --output_path ./output
6
+ """
7
+
8
+ import argparse
9
+ import torch
10
+ from safetensors.torch import load_file
11
+ from accelerate import init_empty_weights
12
+
13
+ # Imported from the original conversion script (or copy the relevant dicts and functions directly)
14
+ from convert_wan import (
15
+ get_transformer_config,
16
+ update_state_dict_,
17
+ DTYPE_MAPPING,
18
+ )
19
+ from diffusers import WanTransformer3DModel, WanVACETransformer3DModel, WanAnimateTransformer3DModel
20
+
21
+
22
+ def convert_single_checkpoint(ckpt_path: str, model_type: str, dtype: str = "bf16"):
23
+ """
24
+ Convert a single checkpoint file into a Diffusers-format transformer.
25
+
26
+ Args:
27
+ ckpt_path: Path to the safetensors file.
28
+ model_type: Model type, e.g. "Wan-T2V-14B", "Wan-I2V-14B-720p", etc.
29
+ dtype: Output precision.
30
+
31
+ Returns:
32
+ The converted transformer model.
33
+ """
34
+ # 1. Get the config and key-rename rules
35
+ config, rename_dict, special_keys_remap = get_transformer_config(model_type)
36
+ diffusers_config = config["diffusers_config"]
37
+
38
+ # 2. Load the original weights
39
+ state_dict = load_file(ckpt_path)
40
+
41
+ # 3. Rename keys
42
+ for key in list(state_dict.keys()):
43
+ new_key = key
44
+ for old, new in rename_dict.items():
45
+ new_key = new_key.replace(old, new)
46
+ update_state_dict_(state_dict, key, new_key)
47
+
48
+ # 4. Handle special keys
49
+ for key in list(state_dict.keys()):
50
+ for special_key, handler_fn in special_keys_remap.items():
51
+ if special_key in key:
52
+ handler_fn(key, state_dict)
53
+
54
+ # 5. Create the model and load the weights
55
+ with init_empty_weights():
56
+ if "Animate" in model_type:
57
+ transformer = WanAnimateTransformer3DModel.from_config(diffusers_config)
58
+ elif "VACE" in model_type:
59
+ transformer = WanVACETransformer3DModel.from_config(diffusers_config)
60
+ else:
61
+ transformer = WanTransformer3DModel.from_config(diffusers_config)
62
+
63
+ transformer.load_state_dict(state_dict, strict=True, assign=True)
64
+
65
+ if dtype != "none":
66
+ transformer = transformer.to(DTYPE_MAPPING[dtype])
67
+
68
+ return transformer
69
+
70
+
71
+ if __name__ == "__main__":
72
+ parser = argparse.ArgumentParser()
73
+ parser.add_argument("--ckpt", type=str, required=True, help="safetensors文件路径")
74
+ parser.add_argument("--model_type", type=str, required=True, help="模型类型")
75
+ parser.add_argument("--output_path", type=str, required=True, help="输出目录")
76
+ parser.add_argument("--dtype", type=str, default="bf16", choices=["fp32", "fp16", "bf16", "none"])
77
+ args = parser.parse_args()
78
+
79
+ transformer = convert_single_checkpoint(args.ckpt, args.model_type, args.dtype)
80
+ transformer.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
81
+ print(f"Saved to {args.output_path}")
sudoku/convert_wan.py ADDED
@@ -0,0 +1,1287 @@
1
+ import argparse
2
+ import pathlib
3
+ from typing import Any, Dict, Tuple
4
+
5
+ import torch
6
+ from accelerate import init_empty_weights
7
+ from huggingface_hub import hf_hub_download, snapshot_download
8
+ from safetensors.torch import load_file
9
+ from transformers import (
10
+ AutoProcessor,
11
+ AutoTokenizer,
12
+ CLIPImageProcessor,
13
+ CLIPVisionModel,
14
+ CLIPVisionModelWithProjection,
15
+ UMT5EncoderModel,
16
+ )
17
+
18
+ from diffusers import (
19
+ AutoencoderKLWan,
20
+ UniPCMultistepScheduler,
21
+ WanAnimatePipeline,
22
+ WanAnimateTransformer3DModel,
23
+ WanImageToVideoPipeline,
24
+ WanPipeline,
25
+ WanTransformer3DModel,
26
+ WanVACEPipeline,
27
+ WanVACETransformer3DModel,
28
+ )
29
+
30
+
31
+ TRANSFORMER_KEYS_RENAME_DICT = {
32
+ "time_embedding.0": "condition_embedder.time_embedder.linear_1",
33
+ "time_embedding.2": "condition_embedder.time_embedder.linear_2",
34
+ "text_embedding.0": "condition_embedder.text_embedder.linear_1",
35
+ "text_embedding.2": "condition_embedder.text_embedder.linear_2",
36
+ "time_projection.1": "condition_embedder.time_proj",
37
+ "head.modulation": "scale_shift_table",
38
+ "head.head": "proj_out",
39
+ "modulation": "scale_shift_table",
40
+ "ffn.0": "ffn.net.0.proj",
41
+ "ffn.2": "ffn.net.2",
42
+ # Hack to swap the layer names
43
+ # The original model calls the norms in following order: norm1, norm3, norm2
44
+ # We convert it to: norm1, norm2, norm3
45
+ "norm2": "norm__placeholder",
46
+ "norm3": "norm2",
47
+ "norm__placeholder": "norm3",
48
+ # For the I2V model
49
+ "img_emb.proj.0": "condition_embedder.image_embedder.norm1",
50
+ "img_emb.proj.1": "condition_embedder.image_embedder.ff.net.0.proj",
51
+ "img_emb.proj.3": "condition_embedder.image_embedder.ff.net.2",
52
+ "img_emb.proj.4": "condition_embedder.image_embedder.norm2",
53
+ # for the FLF2V model
54
+ "img_emb.emb_pos": "condition_embedder.image_embedder.pos_embed",
55
+ # Add attention component mappings
56
+ "self_attn.q": "attn1.to_q",
57
+ "self_attn.k": "attn1.to_k",
58
+ "self_attn.v": "attn1.to_v",
59
+ "self_attn.o": "attn1.to_out.0",
60
+ "self_attn.norm_q": "attn1.norm_q",
61
+ "self_attn.norm_k": "attn1.norm_k",
62
+ "cross_attn.q": "attn2.to_q",
63
+ "cross_attn.k": "attn2.to_k",
64
+ "cross_attn.v": "attn2.to_v",
65
+ "cross_attn.o": "attn2.to_out.0",
66
+ "cross_attn.norm_q": "attn2.norm_q",
67
+ "cross_attn.norm_k": "attn2.norm_k",
68
+ "attn2.to_k_img": "attn2.add_k_proj",
69
+ "attn2.to_v_img": "attn2.add_v_proj",
70
+ "attn2.norm_k_img": "attn2.norm_added_k",
71
+ }
72
+
73
+ VACE_TRANSFORMER_KEYS_RENAME_DICT = {
74
+ "time_embedding.0": "condition_embedder.time_embedder.linear_1",
75
+ "time_embedding.2": "condition_embedder.time_embedder.linear_2",
76
+ "text_embedding.0": "condition_embedder.text_embedder.linear_1",
77
+ "text_embedding.2": "condition_embedder.text_embedder.linear_2",
78
+ "time_projection.1": "condition_embedder.time_proj",
79
+ "head.modulation": "scale_shift_table",
80
+ "head.head": "proj_out",
81
+ "modulation": "scale_shift_table",
82
+ "ffn.0": "ffn.net.0.proj",
83
+ "ffn.2": "ffn.net.2",
84
+ # Hack to swap the layer names
85
+ # The original model calls the norms in following order: norm1, norm3, norm2
86
+ # We convert it to: norm1, norm2, norm3
87
+ "norm2": "norm__placeholder",
88
+ "norm3": "norm2",
89
+ "norm__placeholder": "norm3",
90
+ # # For the I2V model
91
+ # "img_emb.proj.0": "condition_embedder.image_embedder.norm1",
92
+ # "img_emb.proj.1": "condition_embedder.image_embedder.ff.net.0.proj",
93
+ # "img_emb.proj.3": "condition_embedder.image_embedder.ff.net.2",
94
+ # "img_emb.proj.4": "condition_embedder.image_embedder.norm2",
95
+ # # for the FLF2V model
96
+ # "img_emb.emb_pos": "condition_embedder.image_embedder.pos_embed",
97
+ # Add attention component mappings
98
+ "self_attn.q": "attn1.to_q",
99
+ "self_attn.k": "attn1.to_k",
100
+ "self_attn.v": "attn1.to_v",
101
+ "self_attn.o": "attn1.to_out.0",
102
+ "self_attn.norm_q": "attn1.norm_q",
103
+ "self_attn.norm_k": "attn1.norm_k",
104
+ "cross_attn.q": "attn2.to_q",
105
+ "cross_attn.k": "attn2.to_k",
106
+ "cross_attn.v": "attn2.to_v",
107
+ "cross_attn.o": "attn2.to_out.0",
108
+ "cross_attn.norm_q": "attn2.norm_q",
109
+ "cross_attn.norm_k": "attn2.norm_k",
110
+ "attn2.to_k_img": "attn2.add_k_proj",
111
+ "attn2.to_v_img": "attn2.add_v_proj",
112
+ "attn2.norm_k_img": "attn2.norm_added_k",
113
+ "before_proj": "proj_in",
114
+ "after_proj": "proj_out",
115
+ }
116
+
117
+ ANIMATE_TRANSFORMER_KEYS_RENAME_DICT = {
118
+ "time_embedding.0": "condition_embedder.time_embedder.linear_1",
119
+ "time_embedding.2": "condition_embedder.time_embedder.linear_2",
120
+ "text_embedding.0": "condition_embedder.text_embedder.linear_1",
121
+ "text_embedding.2": "condition_embedder.text_embedder.linear_2",
122
+ "time_projection.1": "condition_embedder.time_proj",
123
+ "head.modulation": "scale_shift_table",
124
+ "head.head": "proj_out",
125
+ "modulation": "scale_shift_table",
126
+ "ffn.0": "ffn.net.0.proj",
127
+ "ffn.2": "ffn.net.2",
128
+ # Hack to swap the layer names
129
+ # The original model calls the norms in following order: norm1, norm3, norm2
130
+ # We convert it to: norm1, norm2, norm3
131
+ "norm2": "norm__placeholder",
132
+ "norm3": "norm2",
133
+ "norm__placeholder": "norm3",
134
+ "img_emb.proj.0": "condition_embedder.image_embedder.norm1",
135
+ "img_emb.proj.1": "condition_embedder.image_embedder.ff.net.0.proj",
136
+ "img_emb.proj.3": "condition_embedder.image_embedder.ff.net.2",
137
+ "img_emb.proj.4": "condition_embedder.image_embedder.norm2",
138
+ # Add attention component mappings
139
+ "self_attn.q": "attn1.to_q",
140
+ "self_attn.k": "attn1.to_k",
141
+ "self_attn.v": "attn1.to_v",
142
+ "self_attn.o": "attn1.to_out.0",
143
+ "self_attn.norm_q": "attn1.norm_q",
144
+ "self_attn.norm_k": "attn1.norm_k",
145
+ "cross_attn.q": "attn2.to_q",
146
+ "cross_attn.k": "attn2.to_k",
147
+ "cross_attn.v": "attn2.to_v",
148
+ "cross_attn.o": "attn2.to_out.0",
149
+ "cross_attn.norm_q": "attn2.norm_q",
150
+ "cross_attn.norm_k": "attn2.norm_k",
151
+ "cross_attn.k_img": "attn2.to_k_img",
152
+ "cross_attn.v_img": "attn2.to_v_img",
153
+ "cross_attn.norm_k_img": "attn2.norm_k_img",
154
+ # After cross_attn -> attn2 rename, we need to rename the img keys
155
+ "attn2.to_k_img": "attn2.add_k_proj",
156
+ "attn2.to_v_img": "attn2.add_v_proj",
157
+ "attn2.norm_k_img": "attn2.norm_added_k",
158
+ # Wan Animate-specific mappings (motion encoder, face encoder, face adapter)
159
+ # Motion encoder mappings
160
+ # The name mapping is complicated for the convolutional part so we handle that in its own function
161
+ "motion_encoder.enc.fc": "motion_encoder.motion_network",
162
+ "motion_encoder.dec.direction.weight": "motion_encoder.motion_synthesis_weight",
163
+ # Face encoder mappings - CausalConv1d has a .conv submodule that we need to flatten
164
+ "face_encoder.conv1_local.conv": "face_encoder.conv1_local",
165
+ "face_encoder.conv2.conv": "face_encoder.conv2",
166
+ "face_encoder.conv3.conv": "face_encoder.conv3",
167
+ # Face adapter mappings are handled in a separate function
168
+ }
169
+
170
+
171
+ # TODO: Verify this and simplify if possible.
172
+ def convert_animate_motion_encoder_weights(key: str, state_dict: Dict[str, Any], final_conv_idx: int = 8) -> None:
173
+ """
174
+ Convert all motion encoder weights for Animate model.
175
+
176
+ In the original model:
177
+ - All Linear layers in fc use EqualLinear
178
+ - All Conv2d layers in convs use EqualConv2d (except blur_conv which is initialized separately)
179
+ - Blur kernels are stored as buffers in Sequential modules
180
+ - ConvLayer is nn.Sequential with indices: [Blur (optional), EqualConv2d, FusedLeakyReLU (optional)]
181
+
182
+ Conversion strategy:
183
+ 1. Drop .kernel buffers (blur kernels)
184
+ 2. Rename sequential indices to named components (e.g., 0 -> conv2d, 1 -> bias_leaky_relu)
185
+ """
186
+ # Skip if not a weight, bias, or kernel
187
+ if ".weight" not in key and ".bias" not in key and ".kernel" not in key:
188
+ return
189
+
190
+ # Handle Blur kernel buffers from original implementation.
191
+ # After renaming, these appear under: motion_encoder.res_blocks.*.conv{2,skip}.blur_kernel
192
+ # Diffusers constructs blur kernels as a non-persistent buffer so we must drop these keys
193
+ if ".kernel" in key and "motion_encoder" in key:
194
+ # Remove unexpected blur kernel buffers to avoid strict load errors
195
+ state_dict.pop(key, None)
196
+ return
197
+
198
+ # Rename Sequential indices to named components in ConvLayer and ResBlock
199
+ if ".enc.net_app.convs." in key and (".weight" in key or ".bias" in key):
200
+ parts = key.split(".")
201
+
202
+ # Find the sequential index (digit) after convs or after conv1/conv2/skip
203
+ # Examples:
204
+ # - enc.net_app.convs.0.0.weight -> conv_in.weight (initial conv layer weight)
205
+ # - enc.net_app.convs.0.1.bias -> conv_in.act_fn.bias (initial conv layer bias)
206
+ # - enc.net_app.convs.{n:1-7}.conv1.0.weight -> res_blocks.{(n-1):0-6}.conv1.weight (conv1 weight)
207
+ # - e.g. enc.net_app.convs.1.conv1.0.weight -> res_blocks.0.conv1.weight
208
+ # - enc.net_app.convs.{n:1-7}.conv1.1.bias -> res_blocks.{(n-1):0-6}.conv1.act_fn.bias (conv1 bias)
209
+ # - e.g. enc.net_app.convs.1.conv1.1.bias -> res_blocks.0.conv1.act_fn.bias
210
+ # - enc.net_app.convs.{n:1-7}.conv2.1.weight -> res_blocks.{(n-1):0-6}.conv2.weight (conv2 weight)
211
+ # - enc.net_app.convs.1.conv2.2.bias -> res_blocks.0.conv2.act_fn.bias (conv2 bias)
212
+ # - enc.net_app.convs.{n:1-7}.skip.1.weight -> res_blocks.{(n-1):0-6}.conv_skip.weight (skip conv weight)
213
+ # - enc.net_app.convs.8 -> conv_out (final conv layer)
214
+
215
+ convs_idx = parts.index("convs") if "convs" in parts else -1
216
+ if convs_idx >= 0 and len(parts) - convs_idx >= 2:
217
+ bias = False
218
+ # The nn.Sequential index will always follow convs
219
+ sequential_idx = int(parts[convs_idx + 1])
220
+ if sequential_idx == 0:
221
+ if key.endswith(".weight"):
222
+ new_key = "motion_encoder.conv_in.weight"
223
+ elif key.endswith(".bias"):
224
+ new_key = "motion_encoder.conv_in.act_fn.bias"
225
+ bias = True
226
+ elif sequential_idx == final_conv_idx:
227
+ if key.endswith(".weight"):
228
+ new_key = "motion_encoder.conv_out.weight"
229
+ else:
230
+ # Intermediate .convs. layers, which get mapped to .res_blocks.
231
+ prefix = "motion_encoder.res_blocks."
232
+
233
+ layer_name = parts[convs_idx + 2]
234
+ if layer_name == "skip":
235
+ layer_name = "conv_skip"
236
+
237
+ if key.endswith(".weight"):
238
+ param_name = "weight"
239
+ elif key.endswith(".bias"):
240
+ param_name = "act_fn.bias"
241
+ bias = True
242
+
243
+ suffix_parts = [str(sequential_idx - 1), layer_name, param_name]
244
+ suffix = ".".join(suffix_parts)
245
+ new_key = prefix + suffix
246
+
247
+ param = state_dict.pop(key)
248
+ if bias:
249
+ param = param.squeeze()
250
+ state_dict[new_key] = param
251
+ return
252
+ return
253
+ return
254
+
255
+
256
+ def convert_animate_face_adapter_weights(key: str, state_dict: Dict[str, Any]) -> None:
257
+ """
258
+ Convert face adapter weights for the Animate model.
259
+
260
+ The original model uses a fused KV projection but the diffusers models uses separate K and V projections.
261
+ """
262
+ # Skip if not a weight or bias
263
+ if ".weight" not in key and ".bias" not in key:
264
+ return
265
+
266
+ prefix = "face_adapter."
267
+ if ".fuser_blocks." in key:
268
+ parts = key.split(".")
269
+
270
+ module_list_idx = parts.index("fuser_blocks") if "fuser_blocks" in parts else -1
271
+ if module_list_idx >= 0 and (len(parts) - 1) - module_list_idx == 3:
272
+ block_idx = parts[module_list_idx + 1]
273
+ layer_name = parts[module_list_idx + 2]
274
+ param_name = parts[module_list_idx + 3]
275
+
276
+ if layer_name == "linear1_kv":
277
+ layer_name_k = "to_k"
278
+ layer_name_v = "to_v"
279
+
280
+ suffix_k = ".".join([block_idx, layer_name_k, param_name])
281
+ suffix_v = ".".join([block_idx, layer_name_v, param_name])
282
+ new_key_k = prefix + suffix_k
283
+ new_key_v = prefix + suffix_v
284
+
285
+ kv_proj = state_dict.pop(key)
286
+ k_proj, v_proj = torch.chunk(kv_proj, 2, dim=0)
287
+ state_dict[new_key_k] = k_proj
288
+ state_dict[new_key_v] = v_proj
289
+ return
290
+ else:
291
+ if layer_name == "q_norm":
292
+ new_layer_name = "norm_q"
293
+ elif layer_name == "k_norm":
294
+ new_layer_name = "norm_k"
295
+ elif layer_name == "linear1_q":
296
+ new_layer_name = "to_q"
297
+ elif layer_name == "linear2":
298
+ new_layer_name = "to_out"
299
+
300
+ suffix_parts = [block_idx, new_layer_name, param_name]
301
+ suffix = ".".join(suffix_parts)
302
+ new_key = prefix + suffix
303
+ state_dict[new_key] = state_dict.pop(key)
304
+ return
305
+ return
306
+
307
+
308
+ TRANSFORMER_SPECIAL_KEYS_REMAP = {}
309
+ VACE_TRANSFORMER_SPECIAL_KEYS_REMAP = {}
310
+ ANIMATE_TRANSFORMER_SPECIAL_KEYS_REMAP = {
311
+ "motion_encoder": convert_animate_motion_encoder_weights,
312
+ "face_adapter": convert_animate_face_adapter_weights,
313
+ }
314
+
315
+
316
+ def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> None:
317
+ state_dict[new_key] = state_dict.pop(old_key)
318
+
319
+
320
+ def load_sharded_safetensors(dir: pathlib.Path):
321
+ file_paths = list(dir.glob("diffusion_pytorch_model*.safetensors"))
322
+ state_dict = {}
323
+ for path in file_paths:
324
+ state_dict.update(load_file(path))
325
+ return state_dict
326
+
327
+
328
+ def get_transformer_config(model_type: str) -> Tuple[Dict[str, Any], ...]:
329
+ if model_type == "Wan-T2V-1.3B":
330
+ config = {
331
+ "model_id": "StevenZhang/Wan2.1-T2V-1.3B-Diff",
332
+ "diffusers_config": {
333
+ "added_kv_proj_dim": None,
334
+ "attention_head_dim": 128,
335
+ "cross_attn_norm": True,
336
+ "eps": 1e-06,
337
+ "ffn_dim": 8960,
338
+ "freq_dim": 256,
339
+ "in_channels": 16,
340
+ "num_attention_heads": 12,
341
+ "num_layers": 30,
342
+ "out_channels": 16,
343
+ "patch_size": [1, 2, 2],
344
+ "qk_norm": "rms_norm_across_heads",
345
+ "text_dim": 4096,
346
+ },
347
+ }
348
+ RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
349
+ SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
350
+ elif model_type == "Wan-T2V-14B":
351
+ config = {
352
+ "model_id": "StevenZhang/Wan2.1-T2V-14B-Diff",
353
+ "diffusers_config": {
354
+ "added_kv_proj_dim": None,
355
+ "attention_head_dim": 128,
356
+ "cross_attn_norm": True,
357
+ "eps": 1e-06,
358
+ "ffn_dim": 13824,
359
+ "freq_dim": 256,
360
+ "in_channels": 16,
361
+ "num_attention_heads": 40,
362
+ "num_layers": 40,
363
+ "out_channels": 16,
364
+ "patch_size": [1, 2, 2],
365
+ "qk_norm": "rms_norm_across_heads",
366
+ "text_dim": 4096,
367
+ },
368
+ }
369
+ RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
370
+ SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
371
+ elif model_type == "Wan-I2V-14B-480p":
372
+ config = {
373
+ "model_id": "StevenZhang/Wan2.1-I2V-14B-480P-Diff",
374
+ "diffusers_config": {
375
+ "image_dim": 1280,
376
+ "added_kv_proj_dim": 5120,
377
+ "attention_head_dim": 128,
378
+ "cross_attn_norm": True,
379
+ "eps": 1e-06,
380
+ "ffn_dim": 13824,
381
+ "freq_dim": 256,
382
+ "in_channels": 36,
383
+ "num_attention_heads": 40,
384
+ "num_layers": 40,
385
+ "out_channels": 16,
386
+ "patch_size": [1, 2, 2],
387
+ "qk_norm": "rms_norm_across_heads",
388
+ "text_dim": 4096,
389
+ },
390
+ }
391
+ RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
392
+ SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
393
+ elif model_type == "Wan-I2V-14B-720p":
394
+ config = {
395
+ "model_id": "StevenZhang/Wan2.1-I2V-14B-720P-Diff",
396
+ "diffusers_config": {
397
+ "image_dim": 1280,
398
+ "added_kv_proj_dim": 5120,
399
+ "attention_head_dim": 128,
400
+ "cross_attn_norm": True,
401
+ "eps": 1e-06,
402
+ "ffn_dim": 13824,
403
+ "freq_dim": 256,
404
+ "in_channels": 36,
405
+ "num_attention_heads": 40,
406
+ "num_layers": 40,
407
+ "out_channels": 16,
408
+ "patch_size": [1, 2, 2],
409
+ "qk_norm": "rms_norm_across_heads",
410
+ "text_dim": 4096,
411
+ },
412
+ }
413
+ RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
414
+ SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
415
+ elif model_type == "Wan-FLF2V-14B-720P":
416
+ config = {
417
+ "model_id": "ypyp/Wan2.1-FLF2V-14B-720P", # This is just a placeholder
418
+ "diffusers_config": {
419
+ "image_dim": 1280,
420
+ "added_kv_proj_dim": 5120,
421
+ "attention_head_dim": 128,
422
+ "cross_attn_norm": True,
423
+ "eps": 1e-06,
424
+ "ffn_dim": 13824,
425
+ "freq_dim": 256,
426
+ "in_channels": 36,
427
+ "num_attention_heads": 40,
428
+ "num_layers": 40,
429
+ "out_channels": 16,
430
+ "patch_size": [1, 2, 2],
431
+ "qk_norm": "rms_norm_across_heads",
432
+ "text_dim": 4096,
433
+ "rope_max_seq_len": 1024,
434
+ "pos_embed_seq_len": 257 * 2,
435
+ },
436
+ }
437
+ RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
438
+ SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
439
+ elif model_type == "Wan-VACE-1.3B":
440
+ config = {
441
+ "model_id": "Wan-AI/Wan2.1-VACE-1.3B",
442
+ "diffusers_config": {
443
+ "added_kv_proj_dim": None,
444
+ "attention_head_dim": 128,
445
+ "cross_attn_norm": True,
446
+ "eps": 1e-06,
447
+ "ffn_dim": 8960,
448
+ "freq_dim": 256,
449
+ "in_channels": 16,
450
+ "num_attention_heads": 12,
451
+ "num_layers": 30,
452
+ "out_channels": 16,
453
+ "patch_size": [1, 2, 2],
454
+ "qk_norm": "rms_norm_across_heads",
455
+ "text_dim": 4096,
456
+ "vace_layers": [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28],
457
+ "vace_in_channels": 96,
458
+ },
459
+ }
460
+ RENAME_DICT = VACE_TRANSFORMER_KEYS_RENAME_DICT
461
+ SPECIAL_KEYS_REMAP = VACE_TRANSFORMER_SPECIAL_KEYS_REMAP
462
+ elif model_type == "Wan-VACE-14B":
463
+ config = {
464
+ "model_id": "Wan-AI/Wan2.1-VACE-14B",
465
+ "diffusers_config": {
466
+ "added_kv_proj_dim": None,
467
+ "attention_head_dim": 128,
468
+ "cross_attn_norm": True,
469
+ "eps": 1e-06,
470
+ "ffn_dim": 13824,
471
+ "freq_dim": 256,
472
+ "in_channels": 16,
473
+ "num_attention_heads": 40,
474
+ "num_layers": 40,
475
+ "out_channels": 16,
476
+ "patch_size": [1, 2, 2],
477
+ "qk_norm": "rms_norm_across_heads",
478
+ "text_dim": 4096,
479
+ "vace_layers": [0, 5, 10, 15, 20, 25, 30, 35],
480
+ "vace_in_channels": 96,
481
+ },
482
+ }
483
+ RENAME_DICT = VACE_TRANSFORMER_KEYS_RENAME_DICT
484
+ SPECIAL_KEYS_REMAP = VACE_TRANSFORMER_SPECIAL_KEYS_REMAP
485
+ elif model_type == "Wan2.2-VACE-Fun-14B":
486
+ config = {
487
+ "model_id": "alibaba-pai/Wan2.2-VACE-Fun-A14B",
488
+ "diffusers_config": {
489
+ "added_kv_proj_dim": None,
490
+ "attention_head_dim": 128,
491
+ "cross_attn_norm": True,
492
+ "eps": 1e-06,
493
+ "ffn_dim": 13824,
494
+ "freq_dim": 256,
495
+ "in_channels": 16,
496
+ "num_attention_heads": 40,
497
+ "num_layers": 40,
498
+ "out_channels": 16,
499
+ "patch_size": [1, 2, 2],
500
+ "qk_norm": "rms_norm_across_heads",
501
+ "text_dim": 4096,
502
+ "vace_layers": [0, 5, 10, 15, 20, 25, 30, 35],
503
+ "vace_in_channels": 96,
504
+ },
505
+ }
506
+ RENAME_DICT = VACE_TRANSFORMER_KEYS_RENAME_DICT
507
+ SPECIAL_KEYS_REMAP = VACE_TRANSFORMER_SPECIAL_KEYS_REMAP
508
+ elif model_type == "Wan2.2-I2V-14B-720p":
509
+ config = {
510
+ "model_id": "Wan-AI/Wan2.2-I2V-A14B",
511
+ "diffusers_config": {
512
+ "added_kv_proj_dim": None,
513
+ "attention_head_dim": 128,
514
+ "cross_attn_norm": True,
515
+ "eps": 1e-06,
516
+ "ffn_dim": 13824,
517
+ "freq_dim": 256,
518
+ "in_channels": 36,
519
+ "num_attention_heads": 40,
520
+ "num_layers": 40,
521
+ "out_channels": 16,
522
+ "patch_size": [1, 2, 2],
523
+ "qk_norm": "rms_norm_across_heads",
524
+ "text_dim": 4096,
525
+ },
526
+ }
527
+ RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
528
+ SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
529
+ elif model_type == "Wan2.2-T2V-A14B":
530
+ config = {
531
+ "model_id": "Wan-AI/Wan2.2-T2V-A14B",
532
+ "diffusers_config": {
533
+ "added_kv_proj_dim": None,
534
+ "attention_head_dim": 128,
535
+ "cross_attn_norm": True,
536
+ "eps": 1e-06,
537
+ "ffn_dim": 13824,
538
+ "freq_dim": 256,
539
+ "in_channels": 16,
540
+ "num_attention_heads": 40,
541
+ "num_layers": 40,
542
+ "out_channels": 16,
543
+ "patch_size": [1, 2, 2],
544
+ "qk_norm": "rms_norm_across_heads",
545
+ "text_dim": 4096,
546
+ },
547
+ }
548
+ RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
549
+ SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
550
+ elif model_type == "Wan2.2-TI2V-5B":
551
+ config = {
552
+ "model_id": "Wan-AI/Wan2.2-TI2V-5B",
553
+ "diffusers_config": {
554
+ "added_kv_proj_dim": None,
555
+ "attention_head_dim": 128,
556
+ "cross_attn_norm": True,
557
+ "eps": 1e-06,
558
+ "ffn_dim": 14336,
559
+ "freq_dim": 256,
560
+ "in_channels": 48,
561
+ "num_attention_heads": 24,
562
+ "num_layers": 30,
563
+ "out_channels": 48,
564
+ "patch_size": [1, 2, 2],
565
+ "qk_norm": "rms_norm_across_heads",
566
+ "text_dim": 4096,
567
+ },
568
+ }
569
+ RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
570
+ SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
571
+ elif model_type == "Wan2.2-Animate-14B":
572
+ config = {
573
+ "model_id": "Wan-AI/Wan2.2-Animate-14B",
574
+ "diffusers_config": {
575
+ "image_dim": 1280,
576
+ "added_kv_proj_dim": 5120,
577
+ "attention_head_dim": 128,
578
+ "cross_attn_norm": True,
579
+ "eps": 1e-06,
580
+ "ffn_dim": 13824,
581
+ "freq_dim": 256,
582
+ "in_channels": 36,
583
+ "num_attention_heads": 40,
584
+ "num_layers": 40,
585
+ "out_channels": 16,
586
+ "patch_size": (1, 2, 2),
587
+ "qk_norm": "rms_norm_across_heads",
588
+ "text_dim": 4096,
589
+ "rope_max_seq_len": 1024,
590
+ "pos_embed_seq_len": None,
591
+ "motion_encoder_size": 512, # Start of Wan Animate-specific configs
592
+ "motion_style_dim": 512,
593
+ "motion_dim": 20,
594
+ "motion_encoder_dim": 512,
595
+ "face_encoder_hidden_dim": 1024,
596
+ "face_encoder_num_heads": 4,
597
+ "inject_face_latents_blocks": 5,
598
+ },
599
+ }
600
+ RENAME_DICT = ANIMATE_TRANSFORMER_KEYS_RENAME_DICT
601
+ SPECIAL_KEYS_REMAP = ANIMATE_TRANSFORMER_SPECIAL_KEYS_REMAP
602
+ return config, RENAME_DICT, SPECIAL_KEYS_REMAP
603
+
604
+
605
+ def convert_transformer(model_type: str, stage: str = None):
606
+ config, RENAME_DICT, SPECIAL_KEYS_REMAP = get_transformer_config(model_type)
607
+
608
+ diffusers_config = config["diffusers_config"]
609
+ model_id = config["model_id"]
610
+ model_dir = pathlib.Path(snapshot_download(model_id, repo_type="model"))
611
+
612
+ if stage is not None:
613
+ model_dir = model_dir / stage
614
+
615
+ original_state_dict = load_sharded_safetensors(model_dir)
616
+
617
+ with init_empty_weights():
618
+ if "Animate" in model_type:
619
+ transformer = WanAnimateTransformer3DModel.from_config(diffusers_config)
620
+ elif "VACE" in model_type:
621
+ transformer = WanVACETransformer3DModel.from_config(diffusers_config)
622
+ else:
623
+ transformer = WanTransformer3DModel.from_config(diffusers_config)
624
+
625
+ for key in list(original_state_dict.keys()):
626
+ new_key = key[:]
627
+ for replace_key, rename_key in RENAME_DICT.items():
628
+ new_key = new_key.replace(replace_key, rename_key)
629
+ update_state_dict_(original_state_dict, key, new_key)
630
+
631
+ for key in list(original_state_dict.keys()):
632
+ for special_key, handler_fn_inplace in SPECIAL_KEYS_REMAP.items():
633
+ if special_key not in key:
634
+ continue
635
+ handler_fn_inplace(key, original_state_dict)
636
+
637
+ # Load state dict into the meta model, which will materialize the tensors
638
+ transformer.load_state_dict(original_state_dict, strict=True, assign=True)
639
+
640
+ # Move to CPU to ensure all tensors are materialized
641
+ transformer = transformer.to("cpu")
642
+
643
+ return transformer
644
+
645
+
646
+ def convert_vae():
647
+ vae_ckpt_path = hf_hub_download("Wan-AI/Wan2.1-T2V-14B", "Wan2.1_VAE.pth")
648
+ old_state_dict = torch.load(vae_ckpt_path, weights_only=True)
649
+ new_state_dict = {}
650
+
651
+ # Create mappings for specific components
652
+ middle_key_mapping = {
653
+ # Encoder middle block
654
+ "encoder.middle.0.residual.0.gamma": "encoder.mid_block.resnets.0.norm1.gamma",
655
+ "encoder.middle.0.residual.2.bias": "encoder.mid_block.resnets.0.conv1.bias",
656
+ "encoder.middle.0.residual.2.weight": "encoder.mid_block.resnets.0.conv1.weight",
657
+ "encoder.middle.0.residual.3.gamma": "encoder.mid_block.resnets.0.norm2.gamma",
658
+ "encoder.middle.0.residual.6.bias": "encoder.mid_block.resnets.0.conv2.bias",
659
+ "encoder.middle.0.residual.6.weight": "encoder.mid_block.resnets.0.conv2.weight",
660
+ "encoder.middle.2.residual.0.gamma": "encoder.mid_block.resnets.1.norm1.gamma",
661
+ "encoder.middle.2.residual.2.bias": "encoder.mid_block.resnets.1.conv1.bias",
662
+ "encoder.middle.2.residual.2.weight": "encoder.mid_block.resnets.1.conv1.weight",
663
+ "encoder.middle.2.residual.3.gamma": "encoder.mid_block.resnets.1.norm2.gamma",
664
+ "encoder.middle.2.residual.6.bias": "encoder.mid_block.resnets.1.conv2.bias",
665
+ "encoder.middle.2.residual.6.weight": "encoder.mid_block.resnets.1.conv2.weight",
666
+ # Decoder middle block
667
+ "decoder.middle.0.residual.0.gamma": "decoder.mid_block.resnets.0.norm1.gamma",
668
+ "decoder.middle.0.residual.2.bias": "decoder.mid_block.resnets.0.conv1.bias",
669
+ "decoder.middle.0.residual.2.weight": "decoder.mid_block.resnets.0.conv1.weight",
670
+ "decoder.middle.0.residual.3.gamma": "decoder.mid_block.resnets.0.norm2.gamma",
671
+ "decoder.middle.0.residual.6.bias": "decoder.mid_block.resnets.0.conv2.bias",
672
+ "decoder.middle.0.residual.6.weight": "decoder.mid_block.resnets.0.conv2.weight",
673
+ "decoder.middle.2.residual.0.gamma": "decoder.mid_block.resnets.1.norm1.gamma",
674
+ "decoder.middle.2.residual.2.bias": "decoder.mid_block.resnets.1.conv1.bias",
675
+ "decoder.middle.2.residual.2.weight": "decoder.mid_block.resnets.1.conv1.weight",
676
+ "decoder.middle.2.residual.3.gamma": "decoder.mid_block.resnets.1.norm2.gamma",
677
+ "decoder.middle.2.residual.6.bias": "decoder.mid_block.resnets.1.conv2.bias",
678
+ "decoder.middle.2.residual.6.weight": "decoder.mid_block.resnets.1.conv2.weight",
679
+ }
680
+
681
+ # Create a mapping for attention blocks
682
+ attention_mapping = {
683
+ # Encoder middle attention
684
+ "encoder.middle.1.norm.gamma": "encoder.mid_block.attentions.0.norm.gamma",
685
+ "encoder.middle.1.to_qkv.weight": "encoder.mid_block.attentions.0.to_qkv.weight",
686
+ "encoder.middle.1.to_qkv.bias": "encoder.mid_block.attentions.0.to_qkv.bias",
687
+ "encoder.middle.1.proj.weight": "encoder.mid_block.attentions.0.proj.weight",
688
+ "encoder.middle.1.proj.bias": "encoder.mid_block.attentions.0.proj.bias",
689
+ # Decoder middle attention
690
+ "decoder.middle.1.norm.gamma": "decoder.mid_block.attentions.0.norm.gamma",
691
+ "decoder.middle.1.to_qkv.weight": "decoder.mid_block.attentions.0.to_qkv.weight",
692
+ "decoder.middle.1.to_qkv.bias": "decoder.mid_block.attentions.0.to_qkv.bias",
693
+ "decoder.middle.1.proj.weight": "decoder.mid_block.attentions.0.proj.weight",
694
+ "decoder.middle.1.proj.bias": "decoder.mid_block.attentions.0.proj.bias",
695
+ }
696
+
697
+ # Create a mapping for the head components
698
+ head_mapping = {
699
+ # Encoder head
700
+ "encoder.head.0.gamma": "encoder.norm_out.gamma",
701
+ "encoder.head.2.bias": "encoder.conv_out.bias",
702
+ "encoder.head.2.weight": "encoder.conv_out.weight",
703
+ # Decoder head
704
+ "decoder.head.0.gamma": "decoder.norm_out.gamma",
705
+ "decoder.head.2.bias": "decoder.conv_out.bias",
706
+ "decoder.head.2.weight": "decoder.conv_out.weight",
707
+ }
708
+
709
+ # Create a mapping for the quant components
710
+ quant_mapping = {
711
+ "conv1.weight": "quant_conv.weight",
712
+ "conv1.bias": "quant_conv.bias",
713
+ "conv2.weight": "post_quant_conv.weight",
714
+ "conv2.bias": "post_quant_conv.bias",
715
+ }
716
+
717
+ # Process each key in the state dict
718
+ for key, value in old_state_dict.items():
719
+ # Handle middle block keys using the mapping
720
+ if key in middle_key_mapping:
721
+ new_key = middle_key_mapping[key]
722
+ new_state_dict[new_key] = value
723
+ # Handle attention blocks using the mapping
724
+ elif key in attention_mapping:
725
+ new_key = attention_mapping[key]
726
+ new_state_dict[new_key] = value
727
+ # Handle head keys using the mapping
728
+ elif key in head_mapping:
729
+ new_key = head_mapping[key]
730
+ new_state_dict[new_key] = value
731
+ # Handle quant keys using the mapping
732
+ elif key in quant_mapping:
733
+ new_key = quant_mapping[key]
734
+ new_state_dict[new_key] = value
735
+ # Handle encoder conv1
736
+ elif key == "encoder.conv1.weight":
737
+ new_state_dict["encoder.conv_in.weight"] = value
738
+ elif key == "encoder.conv1.bias":
739
+ new_state_dict["encoder.conv_in.bias"] = value
740
+ # Handle decoder conv1
741
+ elif key == "decoder.conv1.weight":
742
+ new_state_dict["decoder.conv_in.weight"] = value
743
+ elif key == "decoder.conv1.bias":
744
+ new_state_dict["decoder.conv_in.bias"] = value
745
+ # Handle encoder downsamples
746
+ elif key.startswith("encoder.downsamples."):
747
+ # Convert to down_blocks
748
+ new_key = key.replace("encoder.downsamples.", "encoder.down_blocks.")
749
+
750
+ # Convert residual block naming but keep the original structure
751
+ if ".residual.0.gamma" in new_key:
752
+ new_key = new_key.replace(".residual.0.gamma", ".norm1.gamma")
753
+ elif ".residual.2.bias" in new_key:
754
+ new_key = new_key.replace(".residual.2.bias", ".conv1.bias")
755
+ elif ".residual.2.weight" in new_key:
756
+ new_key = new_key.replace(".residual.2.weight", ".conv1.weight")
757
+ elif ".residual.3.gamma" in new_key:
758
+ new_key = new_key.replace(".residual.3.gamma", ".norm2.gamma")
759
+ elif ".residual.6.bias" in new_key:
760
+ new_key = new_key.replace(".residual.6.bias", ".conv2.bias")
761
+ elif ".residual.6.weight" in new_key:
762
+ new_key = new_key.replace(".residual.6.weight", ".conv2.weight")
763
+ elif ".shortcut.bias" in new_key:
764
+ new_key = new_key.replace(".shortcut.bias", ".conv_shortcut.bias")
765
+ elif ".shortcut.weight" in new_key:
766
+ new_key = new_key.replace(".shortcut.weight", ".conv_shortcut.weight")
767
+
768
+ new_state_dict[new_key] = value
769
+
770
+ # Handle decoder upsamples
771
+ elif key.startswith("decoder.upsamples."):
772
+ # Convert to up_blocks
773
+ parts = key.split(".")
774
+ block_idx = int(parts[2])
775
+
776
+ # Group residual blocks
777
+ if "residual" in key:
778
+ if block_idx in [0, 1, 2]:
779
+ new_block_idx = 0
780
+ resnet_idx = block_idx
781
+ elif block_idx in [4, 5, 6]:
782
+ new_block_idx = 1
783
+ resnet_idx = block_idx - 4
784
+ elif block_idx in [8, 9, 10]:
785
+ new_block_idx = 2
786
+ resnet_idx = block_idx - 8
787
+ elif block_idx in [12, 13, 14]:
788
+ new_block_idx = 3
789
+ resnet_idx = block_idx - 12
790
+ else:
791
+ # Keep as is for other blocks
792
+ new_state_dict[key] = value
793
+ continue
794
+
795
+ # Convert residual block naming
796
+ if ".residual.0.gamma" in key:
797
+ new_key = f"decoder.up_blocks.{new_block_idx}.resnets.{resnet_idx}.norm1.gamma"
798
+ elif ".residual.2.bias" in key:
799
+ new_key = f"decoder.up_blocks.{new_block_idx}.resnets.{resnet_idx}.conv1.bias"
800
+ elif ".residual.2.weight" in key:
801
+ new_key = f"decoder.up_blocks.{new_block_idx}.resnets.{resnet_idx}.conv1.weight"
802
+ elif ".residual.3.gamma" in key:
803
+ new_key = f"decoder.up_blocks.{new_block_idx}.resnets.{resnet_idx}.norm2.gamma"
804
+ elif ".residual.6.bias" in key:
805
+ new_key = f"decoder.up_blocks.{new_block_idx}.resnets.{resnet_idx}.conv2.bias"
806
+ elif ".residual.6.weight" in key:
807
+ new_key = f"decoder.up_blocks.{new_block_idx}.resnets.{resnet_idx}.conv2.weight"
808
+ else:
809
+ new_key = key
810
+
811
+ new_state_dict[new_key] = value
812
+
813
+ # Handle shortcut connections
814
+ elif ".shortcut." in key:
815
+ if block_idx == 4:
816
+ new_key = key.replace(".shortcut.", ".resnets.0.conv_shortcut.")
817
+ new_key = new_key.replace("decoder.upsamples.4", "decoder.up_blocks.1")
818
+ else:
819
+ new_key = key.replace("decoder.upsamples.", "decoder.up_blocks.")
820
+ new_key = new_key.replace(".shortcut.", ".conv_shortcut.")
821
+
822
+ new_state_dict[new_key] = value
823
+
824
+ # Handle upsamplers
825
+ elif ".resample." in key or ".time_conv." in key:
826
+ if block_idx == 3:
827
+ new_key = key.replace(f"decoder.upsamples.{block_idx}", "decoder.up_blocks.0.upsamplers.0")
828
+ elif block_idx == 7:
829
+ new_key = key.replace(f"decoder.upsamples.{block_idx}", "decoder.up_blocks.1.upsamplers.0")
830
+ elif block_idx == 11:
831
+ new_key = key.replace(f"decoder.upsamples.{block_idx}", "decoder.up_blocks.2.upsamplers.0")
832
+ else:
833
+ new_key = key.replace("decoder.upsamples.", "decoder.up_blocks.")
834
+
835
+ new_state_dict[new_key] = value
836
+ else:
837
+ new_key = key.replace("decoder.upsamples.", "decoder.up_blocks.")
838
+ new_state_dict[new_key] = value
839
+ else:
840
+ # Keep other keys unchanged
841
+ new_state_dict[key] = value
842
+
843
+ with init_empty_weights():
844
+ vae = AutoencoderKLWan()
845
+ vae.load_state_dict(new_state_dict, strict=True, assign=True)
846
+ return vae
847
+
848
+
849
+ vae22_diffusers_config = {
850
+ "base_dim": 160,
851
+ "z_dim": 48,
852
+ "is_residual": True,
853
+ "in_channels": 12,
854
+ "out_channels": 12,
855
+ "decoder_base_dim": 256,
856
+ "scale_factor_temporal": 4,
857
+ "scale_factor_spatial": 16,
858
+ "patch_size": 2,
859
+ "latents_mean": [
860
+ -0.2289,
861
+ -0.0052,
862
+ -0.1323,
863
+ -0.2339,
864
+ -0.2799,
865
+ 0.0174,
866
+ 0.1838,
867
+ 0.1557,
868
+ -0.1382,
869
+ 0.0542,
870
+ 0.2813,
871
+ 0.0891,
872
+ 0.1570,
873
+ -0.0098,
874
+ 0.0375,
875
+ -0.1825,
876
+ -0.2246,
877
+ -0.1207,
878
+ -0.0698,
879
+ 0.5109,
880
+ 0.2665,
881
+ -0.2108,
882
+ -0.2158,
883
+ 0.2502,
884
+ -0.2055,
885
+ -0.0322,
886
+ 0.1109,
887
+ 0.1567,
888
+ -0.0729,
889
+ 0.0899,
890
+ -0.2799,
891
+ -0.1230,
892
+ -0.0313,
893
+ -0.1649,
894
+ 0.0117,
895
+ 0.0723,
896
+ -0.2839,
897
+ -0.2083,
898
+ -0.0520,
899
+ 0.3748,
900
+ 0.0152,
901
+ 0.1957,
902
+ 0.1433,
903
+ -0.2944,
904
+ 0.3573,
905
+ -0.0548,
906
+ -0.1681,
907
+ -0.0667,
908
+ ],
909
+ "latents_std": [
910
+ 0.4765,
911
+ 1.0364,
912
+ 0.4514,
913
+ 1.1677,
914
+ 0.5313,
915
+ 0.4990,
916
+ 0.4818,
917
+ 0.5013,
918
+ 0.8158,
919
+ 1.0344,
920
+ 0.5894,
921
+ 1.0901,
922
+ 0.6885,
923
+ 0.6165,
924
+ 0.8454,
925
+ 0.4978,
926
+ 0.5759,
927
+ 0.3523,
928
+ 0.7135,
929
+ 0.6804,
930
+ 0.5833,
931
+ 1.4146,
932
+ 0.8986,
933
+ 0.5659,
934
+ 0.7069,
935
+ 0.5338,
936
+ 0.4889,
937
+ 0.4917,
938
+ 0.4069,
939
+ 0.4999,
940
+ 0.6866,
941
+ 0.4093,
942
+ 0.5709,
943
+ 0.6065,
944
+ 0.6415,
945
+ 0.4944,
946
+ 0.5726,
947
+ 1.2042,
948
+ 0.5458,
949
+ 1.6887,
950
+ 0.3971,
951
+ 1.0600,
952
+ 0.3943,
953
+ 0.5537,
954
+ 0.5444,
955
+ 0.4089,
956
+ 0.7468,
957
+ 0.7744,
958
+ ],
959
+ "clip_output": False,
960
+ }
961
+
962
+
963
+ def convert_vae_22():
964
+ vae_ckpt_path = hf_hub_download("Wan-AI/Wan2.2-TI2V-5B", "Wan2.2_VAE.pth")
965
+ old_state_dict = torch.load(vae_ckpt_path, weights_only=True)
966
+ new_state_dict = {}
967
+
968
+ # Create mappings for specific components
969
+ middle_key_mapping = {
970
+ # Encoder middle block
971
+ "encoder.middle.0.residual.0.gamma": "encoder.mid_block.resnets.0.norm1.gamma",
972
+ "encoder.middle.0.residual.2.bias": "encoder.mid_block.resnets.0.conv1.bias",
973
+ "encoder.middle.0.residual.2.weight": "encoder.mid_block.resnets.0.conv1.weight",
974
+ "encoder.middle.0.residual.3.gamma": "encoder.mid_block.resnets.0.norm2.gamma",
975
+ "encoder.middle.0.residual.6.bias": "encoder.mid_block.resnets.0.conv2.bias",
976
+ "encoder.middle.0.residual.6.weight": "encoder.mid_block.resnets.0.conv2.weight",
977
+ "encoder.middle.2.residual.0.gamma": "encoder.mid_block.resnets.1.norm1.gamma",
978
+ "encoder.middle.2.residual.2.bias": "encoder.mid_block.resnets.1.conv1.bias",
979
+ "encoder.middle.2.residual.2.weight": "encoder.mid_block.resnets.1.conv1.weight",
980
+ "encoder.middle.2.residual.3.gamma": "encoder.mid_block.resnets.1.norm2.gamma",
981
+ "encoder.middle.2.residual.6.bias": "encoder.mid_block.resnets.1.conv2.bias",
982
+ "encoder.middle.2.residual.6.weight": "encoder.mid_block.resnets.1.conv2.weight",
983
+ # Decoder middle block
984
+ "decoder.middle.0.residual.0.gamma": "decoder.mid_block.resnets.0.norm1.gamma",
985
+ "decoder.middle.0.residual.2.bias": "decoder.mid_block.resnets.0.conv1.bias",
986
+ "decoder.middle.0.residual.2.weight": "decoder.mid_block.resnets.0.conv1.weight",
987
+ "decoder.middle.0.residual.3.gamma": "decoder.mid_block.resnets.0.norm2.gamma",
988
+ "decoder.middle.0.residual.6.bias": "decoder.mid_block.resnets.0.conv2.bias",
989
+ "decoder.middle.0.residual.6.weight": "decoder.mid_block.resnets.0.conv2.weight",
990
+ "decoder.middle.2.residual.0.gamma": "decoder.mid_block.resnets.1.norm1.gamma",
991
+ "decoder.middle.2.residual.2.bias": "decoder.mid_block.resnets.1.conv1.bias",
992
+ "decoder.middle.2.residual.2.weight": "decoder.mid_block.resnets.1.conv1.weight",
993
+ "decoder.middle.2.residual.3.gamma": "decoder.mid_block.resnets.1.norm2.gamma",
994
+ "decoder.middle.2.residual.6.bias": "decoder.mid_block.resnets.1.conv2.bias",
995
+ "decoder.middle.2.residual.6.weight": "decoder.mid_block.resnets.1.conv2.weight",
996
+ }
997
+
998
+ # Create a mapping for attention blocks
999
+ attention_mapping = {
1000
+ # Encoder middle attention
1001
+ "encoder.middle.1.norm.gamma": "encoder.mid_block.attentions.0.norm.gamma",
1002
+ "encoder.middle.1.to_qkv.weight": "encoder.mid_block.attentions.0.to_qkv.weight",
1003
+ "encoder.middle.1.to_qkv.bias": "encoder.mid_block.attentions.0.to_qkv.bias",
1004
+ "encoder.middle.1.proj.weight": "encoder.mid_block.attentions.0.proj.weight",
1005
+ "encoder.middle.1.proj.bias": "encoder.mid_block.attentions.0.proj.bias",
1006
+ # Decoder middle attention
1007
+ "decoder.middle.1.norm.gamma": "decoder.mid_block.attentions.0.norm.gamma",
1008
+ "decoder.middle.1.to_qkv.weight": "decoder.mid_block.attentions.0.to_qkv.weight",
1009
+ "decoder.middle.1.to_qkv.bias": "decoder.mid_block.attentions.0.to_qkv.bias",
1010
+ "decoder.middle.1.proj.weight": "decoder.mid_block.attentions.0.proj.weight",
1011
+ "decoder.middle.1.proj.bias": "decoder.mid_block.attentions.0.proj.bias",
1012
+ }
1013
+
1014
+ # Create a mapping for the head components
1015
+ head_mapping = {
1016
+ # Encoder head
1017
+ "encoder.head.0.gamma": "encoder.norm_out.gamma",
1018
+ "encoder.head.2.bias": "encoder.conv_out.bias",
1019
+ "encoder.head.2.weight": "encoder.conv_out.weight",
1020
+ # Decoder head
1021
+ "decoder.head.0.gamma": "decoder.norm_out.gamma",
1022
+ "decoder.head.2.bias": "decoder.conv_out.bias",
1023
+ "decoder.head.2.weight": "decoder.conv_out.weight",
1024
+ }
1025
+
1026
+ # Create a mapping for the quant components
1027
+ quant_mapping = {
1028
+ "conv1.weight": "quant_conv.weight",
1029
+ "conv1.bias": "quant_conv.bias",
1030
+ "conv2.weight": "post_quant_conv.weight",
1031
+ "conv2.bias": "post_quant_conv.bias",
1032
+ }
1033
+
1034
+ # Process each key in the state dict
1035
+ for key, value in old_state_dict.items():
1036
+ # Handle middle block keys using the mapping
1037
+ if key in middle_key_mapping:
1038
+ new_key = middle_key_mapping[key]
1039
+ new_state_dict[new_key] = value
1040
+ # Handle attention blocks using the mapping
1041
+ elif key in attention_mapping:
1042
+ new_key = attention_mapping[key]
1043
+ new_state_dict[new_key] = value
1044
+ # Handle head keys using the mapping
1045
+ elif key in head_mapping:
1046
+ new_key = head_mapping[key]
1047
+ new_state_dict[new_key] = value
1048
+ # Handle quant keys using the mapping
1049
+ elif key in quant_mapping:
1050
+ new_key = quant_mapping[key]
1051
+ new_state_dict[new_key] = value
1052
+ # Handle encoder conv1
1053
+ elif key == "encoder.conv1.weight":
1054
+ new_state_dict["encoder.conv_in.weight"] = value
1055
+ elif key == "encoder.conv1.bias":
1056
+ new_state_dict["encoder.conv_in.bias"] = value
1057
+ # Handle decoder conv1
1058
+ elif key == "decoder.conv1.weight":
1059
+ new_state_dict["decoder.conv_in.weight"] = value
1060
+ elif key == "decoder.conv1.bias":
1061
+ new_state_dict["decoder.conv_in.bias"] = value
1062
+ # Handle encoder downsamples
1063
+ elif key.startswith("encoder.downsamples."):
1064
+ # Change encoder.downsamples to encoder.down_blocks
1065
+ new_key = key.replace("encoder.downsamples.", "encoder.down_blocks.")
1066
+
1067
+ # Handle residual blocks - change downsamples to resnets and rename components
1068
+ if "residual" in new_key or "shortcut" in new_key:
1069
+ # Change the second downsamples to resnets
1070
+ new_key = new_key.replace(".downsamples.", ".resnets.")
1071
+
1072
+ # Rename residual components
1073
+ if ".residual.0.gamma" in new_key:
1074
+ new_key = new_key.replace(".residual.0.gamma", ".norm1.gamma")
1075
+ elif ".residual.2.weight" in new_key:
1076
+ new_key = new_key.replace(".residual.2.weight", ".conv1.weight")
1077
+ elif ".residual.2.bias" in new_key:
1078
+ new_key = new_key.replace(".residual.2.bias", ".conv1.bias")
1079
+ elif ".residual.3.gamma" in new_key:
1080
+ new_key = new_key.replace(".residual.3.gamma", ".norm2.gamma")
1081
+ elif ".residual.6.weight" in new_key:
1082
+ new_key = new_key.replace(".residual.6.weight", ".conv2.weight")
1083
+ elif ".residual.6.bias" in new_key:
1084
+ new_key = new_key.replace(".residual.6.bias", ".conv2.bias")
1085
+ elif ".shortcut.weight" in new_key:
1086
+ new_key = new_key.replace(".shortcut.weight", ".conv_shortcut.weight")
1087
+ elif ".shortcut.bias" in new_key:
1088
+ new_key = new_key.replace(".shortcut.bias", ".conv_shortcut.bias")
1089
+
1090
+ # Handle resample blocks - change downsamples to downsampler and remove index
1091
+ elif "resample" in new_key or "time_conv" in new_key:
1092
+ # Change the second downsamples to downsampler and remove the index
1093
+ parts = new_key.split(".")
1094
+ # Find the pattern: encoder.down_blocks.X.downsamples.Y.resample...
1095
+ # We want to change it to: encoder.down_blocks.X.downsampler.resample...
1096
+ if len(parts) >= 4 and parts[3] == "downsamples":
1097
+ # Remove the index (parts[4]) and change downsamples to downsampler
1098
+ new_parts = parts[:3] + ["downsampler"] + parts[5:]
1099
+ new_key = ".".join(new_parts)
1100
+
1101
+ new_state_dict[new_key] = value
1102
+
1103
+ # Handle decoder upsamples
1104
+ elif key.startswith("decoder.upsamples."):
1105
+ # Change decoder.upsamples to decoder.up_blocks
1106
+ new_key = key.replace("decoder.upsamples.", "decoder.up_blocks.")
1107
+
1108
+ # Handle residual blocks - change upsamples to resnets and rename components
1109
+ if "residual" in new_key or "shortcut" in new_key:
1110
+ # Change the second upsamples to resnets
1111
+ new_key = new_key.replace(".upsamples.", ".resnets.")
1112
+
1113
+ # Rename residual components
1114
+ if ".residual.0.gamma" in new_key:
1115
+ new_key = new_key.replace(".residual.0.gamma", ".norm1.gamma")
1116
+ elif ".residual.2.weight" in new_key:
1117
+ new_key = new_key.replace(".residual.2.weight", ".conv1.weight")
1118
+ elif ".residual.2.bias" in new_key:
1119
+ new_key = new_key.replace(".residual.2.bias", ".conv1.bias")
1120
+ elif ".residual.3.gamma" in new_key:
1121
+ new_key = new_key.replace(".residual.3.gamma", ".norm2.gamma")
1122
+ elif ".residual.6.weight" in new_key:
1123
+ new_key = new_key.replace(".residual.6.weight", ".conv2.weight")
1124
+ elif ".residual.6.bias" in new_key:
1125
+ new_key = new_key.replace(".residual.6.bias", ".conv2.bias")
1126
+ elif ".shortcut.weight" in new_key:
1127
+ new_key = new_key.replace(".shortcut.weight", ".conv_shortcut.weight")
1128
+ elif ".shortcut.bias" in new_key:
1129
+ new_key = new_key.replace(".shortcut.bias", ".conv_shortcut.bias")
1130
+
1131
+ # Handle resample blocks - change upsamples to upsampler and remove index
1132
+ elif "resample" in new_key or "time_conv" in new_key:
1133
+ # Change the second upsamples to upsampler and remove the index
1134
+ parts = new_key.split(".")
1135
+ # Find the pattern: decoder.up_blocks.X.upsamples.Y.resample...
1136
+ # We want to change it to: decoder.up_blocks.X.upsampler.resample...
1137
+ if len(parts) >= 4 and parts[3] == "upsamples":
1138
+ # Remove the index (parts[4]) and change upsamples to upsampler
1139
+ new_parts = parts[:3] + ["upsampler"] + parts[5:]
1140
+ new_key = ".".join(new_parts)
1141
+
1142
+ new_state_dict[new_key] = value
1143
+ else:
1144
+ # Keep other keys unchanged
1145
+ new_state_dict[key] = value
1146
+
1147
+ with init_empty_weights():
1148
+ vae = AutoencoderKLWan(**vae22_diffusers_config)
1149
+ vae.load_state_dict(new_state_dict, strict=True, assign=True)
1150
+ return vae
1151
+
1152
+
1153
+ def get_args():
1154
+ parser = argparse.ArgumentParser()
1155
+ parser.add_argument("--model_type", type=str, default=None)
1156
+ parser.add_argument("--output_path", type=str, required=True)
1157
+ parser.add_argument("--dtype", default="fp32", choices=["fp32", "fp16", "bf16", "none"])
1158
+ return parser.parse_args()
1159
+
1160
+
1161
+ DTYPE_MAPPING = {
1162
+ "fp32": torch.float32,
1163
+ "fp16": torch.float16,
1164
+ "bf16": torch.bfloat16,
1165
+ }
1166
+
1167
+
1168
+ if __name__ == "__main__":
1169
+ args = get_args()
1170
+
1171
+ if "Wan2.2" in args.model_type and "TI2V" not in args.model_type and "Animate" not in args.model_type:
1172
+ transformer = convert_transformer(args.model_type, stage="high_noise_model")
1173
+ transformer_2 = convert_transformer(args.model_type, stage="low_noise_model")
1174
+ else:
1175
+ transformer = convert_transformer(args.model_type)
1176
+ transformer_2 = None
1177
+
1178
+ if "Wan2.2" in args.model_type and "TI2V" in args.model_type:
1179
+ vae = convert_vae_22()
1180
+ else:
1181
+ vae = convert_vae()
1182
+
1183
+ text_encoder = UMT5EncoderModel.from_pretrained("google/umt5-xxl", torch_dtype=torch.bfloat16)
1184
+ tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
1185
+ if "FLF2V" in args.model_type:
1186
+ flow_shift = 16.0
1187
+ elif "TI2V" in args.model_type or "Animate" in args.model_type:
1188
+ flow_shift = 5.0
1189
+ else:
1190
+ flow_shift = 3.0
1191
+ scheduler = UniPCMultistepScheduler(
1192
+ prediction_type="flow_prediction", use_flow_sigmas=True, num_train_timesteps=1000, flow_shift=flow_shift
1193
+ )
1194
+
1195
+ # If user has specified "none", we keep the original dtypes of the state dict without any conversion
1196
+ if args.dtype != "none":
1197
+ dtype = DTYPE_MAPPING[args.dtype]
1198
+ transformer.to(dtype)
1199
+ if transformer_2 is not None:
1200
+ transformer_2.to(dtype)
1201
+
1202
+ if "Wan2.2" and "I2V" in args.model_type and "TI2V" not in args.model_type:
1203
+ pipe = WanImageToVideoPipeline(
1204
+ transformer=transformer,
1205
+ transformer_2=transformer_2,
1206
+ text_encoder=text_encoder,
1207
+ tokenizer=tokenizer,
1208
+ vae=vae,
1209
+ scheduler=scheduler,
1210
+ boundary_ratio=0.9,
1211
+ )
1212
+ elif "Wan2.2" and "T2V" in args.model_type:
1213
+ pipe = WanPipeline(
1214
+ transformer=transformer,
1215
+ transformer_2=transformer_2,
1216
+ text_encoder=text_encoder,
1217
+ tokenizer=tokenizer,
1218
+ vae=vae,
1219
+ scheduler=scheduler,
1220
+ boundary_ratio=0.875,
1221
+ )
1222
+ elif "Wan2.2" and "TI2V" in args.model_type:
1223
+ pipe = WanPipeline(
1224
+ transformer=transformer,
1225
+ text_encoder=text_encoder,
1226
+ tokenizer=tokenizer,
1227
+ vae=vae,
1228
+ scheduler=scheduler,
1229
+ expand_timesteps=True,
1230
+ )
1231
+ elif "I2V" in args.model_type or "FLF2V" in args.model_type:
1232
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
1233
+ "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", torch_dtype=torch.bfloat16
1234
+ )
1235
+ image_processor = AutoProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
1236
+ pipe = WanImageToVideoPipeline(
1237
+ transformer=transformer,
1238
+ text_encoder=text_encoder,
1239
+ tokenizer=tokenizer,
1240
+ vae=vae,
1241
+ scheduler=scheduler,
1242
+ image_encoder=image_encoder,
1243
+ image_processor=image_processor,
1244
+ )
1245
+ elif "Wan2.2-VACE" in args.model_type:
1246
+ pipe = WanVACEPipeline(
1247
+ transformer=transformer,
1248
+ transformer_2=transformer_2,
1249
+ text_encoder=text_encoder,
1250
+ tokenizer=tokenizer,
1251
+ vae=vae,
1252
+ scheduler=scheduler,
1253
+ boundary_ratio=0.875,
1254
+ )
1255
+ elif "Wan-VACE" in args.model_type:
1256
+ pipe = WanVACEPipeline(
1257
+ transformer=transformer,
1258
+ text_encoder=text_encoder,
1259
+ tokenizer=tokenizer,
1260
+ vae=vae,
1261
+ scheduler=scheduler,
1262
+ )
1263
+ elif "Animate" in args.model_type:
1264
+ image_encoder = CLIPVisionModel.from_pretrained(
1265
+ "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", torch_dtype=torch.bfloat16
1266
+ )
1267
+ image_processor = CLIPImageProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
1268
+
1269
+ pipe = WanAnimatePipeline(
1270
+ transformer=transformer,
1271
+ text_encoder=text_encoder,
1272
+ tokenizer=tokenizer,
1273
+ vae=vae,
1274
+ scheduler=scheduler,
1275
+ image_encoder=image_encoder,
1276
+ image_processor=image_processor,
1277
+ )
1278
+ else:
1279
+ pipe = WanPipeline(
1280
+ transformer=transformer,
1281
+ text_encoder=text_encoder,
1282
+ tokenizer=tokenizer,
1283
+ vae=vae,
1284
+ scheduler=scheduler,
1285
+ )
1286
+
1287
+ pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
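Once the conversion finishes, the saved directory can be loaded back through diffusers. A minimal load-back sketch, assuming the script was run with --model_type Wan2.2-TI2V-5B and --output_path ./Wan2.2-TI2V-5B-Diffusers (the path, dtype, and device below are illustrative, not part of the script):

import torch
from diffusers import WanPipeline

# Load the pipeline saved by pipe.save_pretrained(...) above.
pipe = WanPipeline.from_pretrained("./Wan2.2-TI2V-5B-Diffusers", torch_dtype=torch.bfloat16)
pipe.to("cuda")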
sudoku/generate_dataset.py ADDED
@@ -0,0 +1,424 @@
1
+ """
2
+ Sudoku Video Dataset Generator - Supports flexible solution count expressions per puzzle.
3
+ With checkpoint/resume support via metadata.json.
4
+ """
5
+ import json
6
+ import re
7
+ import random
8
+ import argparse
9
+ from dataclasses import dataclass, asdict
10
+ from pathlib import Path
11
+ from typing import List, Tuple, Optional, Union, Dict, Any
12
+ import numpy as np
13
+ import cv2
14
+ from tqdm import tqdm
15
+ from sudoku_processor import SudokuProcessor
16
+
17
+
18
+ # ==================== Solution Range ====================
19
+
20
+ @dataclass
21
+ class SolRange:
22
+ """Flexible solution count constraint for puzzle generation."""
23
+ min_sol: int
24
+ max_sol: Optional[int]
25
+
26
+ @classmethod
27
+ def parse(cls, expr: str) -> "SolRange":
28
+ expr = expr.strip()
29
+ m = re.fullmatch(r'(\d+)\s*-\s*(\d+)', expr)
30
+ if m:
31
+ lo, hi = int(m.group(1)), int(m.group(2))
32
+ if lo < 1: raise ValueError(f"min_sol must be >= 1, got {lo}")
33
+ if hi < lo: raise ValueError(f"Invalid range: {lo}-{hi}")
34
+ return cls(min_sol=lo, max_sol=hi)
35
+ m = re.fullmatch(r'(>=|>|<=|<|==)\s*(\d+)', expr)
36
+ if m:
37
+ op, n = m.group(1), int(m.group(2))
38
+ if op == '>=': return cls(min_sol=max(1, n), max_sol=None)
39
+ elif op == '>': return cls(min_sol=max(1, n + 1), max_sol=None)
40
+ elif op == '<=': return cls(min_sol=1, max_sol=n)
41
+ elif op == '<': return cls(min_sol=1, max_sol=max(1, n - 1))
42
+ elif op == '==': return cls(min_sol=n, max_sol=n)
43
+ m = re.fullmatch(r'(\d+)', expr)
44
+ if m:
45
+ n = int(m.group(1))
46
+ if n < 1: raise ValueError(f"sol_num must be >= 1, got {n}")
47
+ return cls(min_sol=n, max_sol=n)
48
+ raise ValueError(f"Invalid sol_num expression: '{expr}'")
49
+
50
+ @property
51
+ def is_exact(self): return self.max_sol is not None and self.min_sol == self.max_sol
52
+ @property
53
+ def is_unique_only(self): return self.is_exact and self.min_sol == 1
54
+ @property
55
+ def allows_unique(self): return self.min_sol <= 1
56
+ @property
57
+ def requires_multi(self): return self.min_sol > 1
58
+ @property
59
+ def effective_max(self): return self.max_sol if self.max_sol is not None else max(self.min_sol, 10)
60
+ def accepts(self, count):
61
+ if count < self.min_sol: return False
62
+ if self.max_sol is not None and count > self.max_sol: return False
63
+ return True
64
+ def __repr__(self):
65
+ if self.is_exact: return f"SolRange(=={self.min_sol})"
66
+ if self.max_sol is None: return f"SolRange(>={self.min_sol})"
67
+ return f"SolRange({self.min_sol}-{self.max_sol})"
68
+
69
+
70
+ # ==================== Checkpoint Management ====================
71
+
72
+ @dataclass
73
+ class GenerationState:
74
+ """Tracks generation progress for checkpoint/resume."""
75
+ params_hash: str
76
+ clue_progress: Dict[int, int] # clue_level -> generated_count
77
+ seen_grids: List[str]
78
+ all_samples: List[Dict]
79
+ completed: bool = False
80
+
81
+ def to_dict(self) -> Dict:
82
+ return asdict(self)
83
+
84
+ @classmethod
85
+ def from_dict(cls, d: Dict) -> "GenerationState":
86
+ return cls(**d)
87
+
88
+
89
+ def compute_params_hash(params: Dict) -> str:
90
+ """Compute hash of generation parameters for consistency check."""
91
+ import hashlib
92
+ # Only hash parameters that affect generation logic
93
+ key_params = {k: v for k, v in params.items()
94
+ if k not in ['output_dir']} # output_dir can differ
95
+ return hashlib.md5(json.dumps(key_params, sort_keys=True).encode()).hexdigest()[:12]
96
+
97
+
98
+ def load_checkpoint(output_dir: Path, params: Dict) -> Optional[GenerationState]:
99
+ """Load checkpoint if exists and params match."""
100
+ meta_path = output_dir / "metadata.json"
101
+ if not meta_path.exists():
102
+ return None
103
+
104
+ with open(meta_path) as f:
105
+ data = json.load(f)
106
+
107
+ state = GenerationState.from_dict(data["state"])
108
+ expected_hash = compute_params_hash(params)
109
+
110
+ if state.params_hash != expected_hash:
111
+ print(f"⚠️ Parameters changed (hash {state.params_hash} → {expected_hash}), starting fresh")
112
+ return None
113
+
114
+ if state.completed:
115
+ print("✓ Generation already completed")
116
+ return state
117
+
118
+ print(f"✓ Resuming from checkpoint: {sum(state.clue_progress.values())} puzzles generated")
119
+ return state
120
+
121
+
122
+ def save_checkpoint(output_dir: Path, state: GenerationState, params: Dict):
123
+ """Save current generation state to metadata.json."""
124
+ meta_path = output_dir / "metadata.json"
125
+ data = {
126
+ "params": params,
127
+ "state": state.to_dict()
128
+ }
129
+ # Atomic write
130
+ tmp_path = meta_path.with_suffix('.tmp')
131
+ with open(tmp_path, 'w') as f:
132
+ json.dump(data, f, indent=2)
133
+ tmp_path.rename(meta_path)
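For orientation, the metadata.json written here has roughly the following shape; the keys come from the params dict and GenerationState, while the concrete values below are made up:

# Illustrative checkpoint layout (values are hypothetical).
example_metadata = {
    "params": {"clue_levels": [30, 40], "num_per_clue": 50, "sol_num": "<=3", "seed": 42},
    "state": {
        "params_hash": "a1b2c3d4e5f6",        # compute_params_hash(params)
        "clue_progress": {"30": 12, "40": 7},  # clue level -> puzzles generated so far (JSON stringifies the int keys)
        "seen_grids": [],                      # encoded puzzle fingerprints used for deduplication
        "all_samples": [],                     # one record per rendered video
        "completed": False,
    },
}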
134
+
135
+
136
+ # ==================== Core Functions ====================
137
+
138
+ def get_fill_order(puzzle, solution):
139
+ return [(i, j, solution[i][j]) for i in range(9) for j in range(9) if puzzle[i][j] == 0]
140
+
141
+ def create_processor(resolution=None):
142
+ if resolution is None: return SudokuProcessor()
143
+ target_size = min(resolution)
144
+ cell_size = target_size // 9
145
+ sf = cell_size / 60
146
+ return SudokuProcessor(cell_size=cell_size, font_scale=1.2*sf, thickness=max(1, int(2*sf)))
147
+
148
+ def generate_video_frames(proc, puzzle, solution, n_start, m_end, k=1, max_frames=None):
149
+ fills = get_fill_order(puzzle, solution)
150
+ n_fills = len(fills)
151
+ effective_k = k
152
+ if max_frames is not None and n_start + n_fills * k + m_end > max_frames:
153
+ avail = max_frames - n_start - m_end
154
+ effective_k = max(1, avail // n_fills) if avail > 0 and n_fills > 0 else 1
155
+
156
+ frames = []
157
+ current = [row[:] for row in puzzle]
158
+ img = proc.render(current)
159
+ frames.extend([img.copy() for _ in range(n_start)])
160
+
161
+ for r, c, v in fills:
162
+ current[r][c] = v
163
+ frames.append(proc.render(current, highlight_new=(r, c), original=puzzle))
164
+ if effective_k > 1:
165
+ img = proc.render(current, original=puzzle)
166
+ frames.extend([img.copy() for _ in range(effective_k - 1)])
167
+
168
+ img = proc.render(solution, original=puzzle)
169
+ frames.extend([img.copy() for _ in range(m_end)])
170
+ if max_frames is not None and len(frames) > max_frames:
171
+ frames = frames[:max_frames]
172
+ return frames
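As a concrete check of the frame budget above: the clip length before truncation is n_start + n_fills * k + m_end, and when that exceeds max_frames the repeat factor k is shrunk first and the tail truncated afterwards. A small worked example with arbitrary numbers:

# 51-clue puzzle -> 30 empty cells; argparse defaults n_start=2, k=1, m_end=3.
n_start, m_end, k, n_fills = 2, 3, 1, 30
assert n_start + n_fills * k + m_end == 35   # frames rendered without a cap
# With max_frames=20: avail = 20 - 2 - 3 = 15, so effective_k = max(1, 15 // 30) = 1
# and the 35 rendered frames are simply truncated to the first 20.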
173
+
174
+ def save_video(frames, path, fps=10):
175
+ h, w = frames[0].shape[:2]
176
+ writer = cv2.VideoWriter(str(path), cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
177
+ for f in frames: writer.write(cv2.cvtColor(f, cv2.COLOR_RGB2BGR))
178
+ writer.release()
179
+
180
+ def normalize_num_per_clue(num_per_clue, clue_levels):
181
+ if isinstance(num_per_clue, int): return [num_per_clue] * len(clue_levels)
182
+ if len(num_per_clue) != len(clue_levels):
183
+ raise ValueError(f"num_per_clue length ({len(num_per_clue)}) != clue_levels ({len(clue_levels)})")
184
+ return num_per_clue
185
+
186
+
187
+ # ==================== Puzzle Generation with SolRange ====================
188
+
189
+ def generate_puzzle_with_range(proc, clue, sol_range, min_hamming):
190
+ """Generate one puzzle respecting sol_range. Returns (puzzle, solutions) or None."""
191
+ if sol_range.is_unique_only:
192
+ puzzle, solution = proc.generate(clue, unique=True)
193
+ return puzzle, [solution]
194
+
195
+ if sol_range.requires_multi:
196
+ try:
197
+ puzzle, solutions = proc.generate_multi_solution(
198
+ clue, min_solutions=sol_range.min_sol,
199
+ max_solutions=sol_range.effective_max,
200
+ max_attempts=1, min_hamming=min_hamming
201
+ )
202
+ if sol_range.accepts(len(solutions)):
203
+ return puzzle, solutions
204
+ except RuntimeError:
205
+ pass
206
+ return None
207
+
208
+ try:
209
+ puzzle, solutions = proc.generate_multi_solution(
210
+ clue, min_solutions=max(2, sol_range.min_sol),
211
+ max_solutions=sol_range.effective_max,
212
+ max_attempts=1, min_hamming=min_hamming
213
+ )
214
+ if sol_range.accepts(len(solutions)):
215
+ return puzzle, solutions
216
+ except RuntimeError:
217
+ pass
218
+
219
+ if sol_range.allows_unique:
220
+ puzzle, solution = proc.generate(clue, unique=True)
221
+ return puzzle, [solution]
222
+ return None
223
+
224
+
225
+ # ==================== Dataset Generation ====================
226
+
227
+ def generate_dataset(
228
+ output_dir="sudoku_video", clue_levels=[30,40,50,60], num_per_clue=50,
229
+ sol_num="1", min_hamming=10, train_ratio=0.8,
230
+ prompt="Solve this Sudoku puzzle using red font.",
231
+ n_start=10, m_end=10, k=1, max_frames=None, fps=10,
232
+ resolution=None, seed=42, checkpoint_interval=50
233
+ ):
234
+ """
235
+ Generate Sudoku video dataset with checkpoint/resume support.
236
+
237
+ Args:
238
+ checkpoint_interval: Save checkpoint every N puzzles (default: 50)
239
+ """
240
+ # Prepare params dict for hashing
241
+ params = {
242
+ "clue_levels": clue_levels, "num_per_clue": num_per_clue,
243
+ "sol_num": sol_num, "min_hamming": min_hamming, "train_ratio": train_ratio,
244
+ "prompt": prompt, "n_start": n_start, "m_end": m_end, "k": k,
245
+ "max_frames": max_frames, "fps": fps, "resolution": resolution, "seed": seed
246
+ }
247
+
248
+ output_dir = Path(output_dir)
249
+ video_dir = output_dir / "videos"
250
+ image_dir = output_dir / "images"
251
+ video_dir.mkdir(parents=True, exist_ok=True)
252
+ image_dir.mkdir(parents=True, exist_ok=True)
253
+
254
+ # Try to resume from checkpoint
255
+ state = load_checkpoint(output_dir, params)
256
+
257
+ if state and state.completed:
258
+ return # Already done
259
+
260
+ sol_range = SolRange.parse(str(sol_num))
261
+ proc = create_processor(resolution)
262
+ actual_size = proc.img_size
263
+ num_per_clue_list = normalize_num_per_clue(num_per_clue, clue_levels)
264
+ max_puzzles = max(num_per_clue_list)
265
+ num_width = len(str(max_puzzles))
266
+
267
+ # Initialize or restore state
268
+ if state is None:
269
+ random.seed(seed)
270
+ state = GenerationState(
271
+ params_hash=compute_params_hash(params),
272
+ clue_progress={clue: 0 for clue in clue_levels},
273
+ seen_grids=[],
274
+ all_samples=[]
275
+ )
276
+ print(f"Starting fresh generation with solution range: {sol_range}")
277
+ else:
278
+ # Restore RNG state approximately by fast-forwarding
279
+ random.seed(seed)
280
+ for _ in range(sum(state.clue_progress.values()) * 10):
281
+ random.random()
282
+
283
+ seen_grids = set(state.seen_grids)
284
+ all_samples = state.all_samples.copy()
285
+ clue_progress = {int(k): v for k, v in state.clue_progress.items()}
286
+
287
+ total_target = sum(num_per_clue_list)
288
+ total_done = sum(clue_progress.values())
289
+ stats_unique = sum(1 for s in all_samples if s["total_solutions"] == 1 and s["sol_idx"] == 0)
290
+ stats_multi = sum(1 for s in all_samples if s["total_solutions"] > 1 and s["sol_idx"] == 0)
291
+ puzzles_since_checkpoint = 0
292
+
293
+ with tqdm(total=total_target, initial=total_done, desc="Total", unit="puzzle") as pbar_total:
294
+ for clue, target_count in zip(clue_levels, num_per_clue_list):
295
+ generated = clue_progress.get(clue, 0)
296
+ if generated >= target_count:
297
+ continue # This clue level is done
298
+
299
+ max_attempts = (target_count - generated) * 20
300
+
301
+ with tqdm(total=target_count, initial=generated, desc=f"Clue {clue:2d}",
302
+ unit="puzzle", leave=False) as pbar_clue:
303
+ for _ in range(max_attempts):
304
+ if generated >= target_count:
305
+ break
306
+
307
+ result = generate_puzzle_with_range(proc, clue, sol_range, min_hamming)
308
+ if result is None:
309
+ continue
310
+ puzzle, solutions = result
311
+
312
+ fp = proc.encode(puzzle)
313
+ if fp in seen_grids:
314
+ continue
315
+ seen_grids.add(fp)
316
+
317
+ n_sols = len(solutions)
318
+ if n_sols == 1:
319
+ stats_unique += 1
320
+ else:
321
+ stats_multi += 1
322
+
323
+ img_name = f"clue{clue}_{generated:0{num_width}d}.png"
324
+ puzzle_img = proc.render(puzzle)
325
+ cv2.imwrite(str(image_dir / img_name), cv2.cvtColor(puzzle_img, cv2.COLOR_RGB2BGR))
326
+
327
+ for si, sol in enumerate(solutions):
328
+ vid_name = f"clue{clue}_{generated:0{num_width}d}_sol{si}.mp4"
329
+ frames = generate_video_frames(proc, puzzle, sol, n_start, m_end, k, max_frames)
330
+ save_video(frames, video_dir / vid_name, fps)
331
+
332
+ hdists = [proc._hamming(sol, solutions[j]) for j in range(n_sols) if j != si]
333
+ all_samples.append({
334
+ "prompt": prompt, "video": vid_name, "image": img_name,
335
+ "clue": clue, "puzzle": fp, "solution": proc.encode(sol),
336
+ "sol_idx": si, "total_solutions": n_sols,
337
+ "frame_count": len(frames),
338
+ "min_hamming_to_others": min(hdists) if hdists else 0
339
+ })
340
+
341
+ generated += 1
342
+ clue_progress[clue] = generated
343
+ puzzles_since_checkpoint += 1
344
+ pbar_clue.update(1)
345
+ pbar_total.update(1)
346
+
347
+ # Periodic checkpoint
348
+ if puzzles_since_checkpoint >= checkpoint_interval:
349
+ state.clue_progress = clue_progress
350
+ state.seen_grids = list(seen_grids)
351
+ state.all_samples = all_samples
352
+ save_checkpoint(output_dir, state, params)
353
+ puzzles_since_checkpoint = 0
354
+
355
+ tqdm.write(f"Clue {clue}: {generated} puzzles, "
356
+ f"{sum(1 for s in all_samples if s['clue'] == clue)} videos")
357
+
358
+ # Final output
359
+ random.seed(seed + 1) # Deterministic shuffle
360
+ random.shuffle(all_samples)
361
+ split_idx = int(len(all_samples) * train_ratio)
362
+
363
+ def write_jsonl(samples, path):
364
+ with open(path, 'w') as f:
365
+ for s in samples:
366
+ json.dump(s, f)
367
+ f.write('\n')
368
+
369
+ write_jsonl(all_samples[:split_idx], output_dir / "train.jsonl")
370
+ write_jsonl(all_samples[split_idx:], output_dir / "test.jsonl")
371
+
372
+ # Mark as completed
373
+ state.clue_progress = clue_progress
374
+ state.seen_grids = list(seen_grids)
375
+ state.all_samples = all_samples
376
+ state.completed = True
377
+ save_checkpoint(output_dir, state, params)
378
+
379
+ print(f"\n✓ Dataset complete: {output_dir}/")
380
+ print(f" Resolution: {actual_size}x{actual_size}")
381
+ print(f" Solution range: {sol_range}")
382
+ print(f" Puzzles: {len(seen_grids)} ({stats_unique} unique, {stats_multi} multi-sol)")
383
+ print(f" Videos: {len(all_samples)}")
384
+ print(f" Train: {split_idx}, Test: {len(all_samples) - split_idx}")
385
+
386
+ hammings = [s["min_hamming_to_others"] for s in all_samples if s["min_hamming_to_others"] > 0]
387
+ if hammings:
388
+ print(f" Solution diversity: avg={np.mean(hammings):.1f}, min={min(hammings)}, max={max(hammings)}")
389
+
390
+
391
+ def parse_resolution(s):
392
+ w, h = map(int, s.lower().split('x'))
393
+ return (w, h)
394
+
395
+ def parse_args():
396
+ p = argparse.ArgumentParser(description="Generate Sudoku video dataset with resume support")
397
+ p.add_argument("--output-dir", type=str, default="sudoku")
398
+ p.add_argument("--clue-levels", type=int, nargs="+", default=[20,30,40,50,60,70])
399
+ p.add_argument("--num-per-clue", type=int, nargs="+", default=[15000,10000,10000,5000,2000,1000])
400
+ p.add_argument("--sol-num", type=str, default="<=3",
401
+ help="'1', '3', '>=1', '>1', '<=3', '<3', '2-5'")
402
+ p.add_argument("--min-hamming", type=int, default=10)
403
+ p.add_argument("--train-ratio", type=float, default=0.9)
404
+ p.add_argument("--prompt", type=str, default="Solve this Sudoku puzzle using red font.")
405
+ p.add_argument("--n-start", type=int, default=2)
406
+ p.add_argument("--m-end", type=int, default=3)
407
+ p.add_argument("--k", type=int, default=1)
408
+ p.add_argument("--max-frames", type=int, default=None)
409
+ p.add_argument("--fps", type=int, default=10)
410
+ p.add_argument("--resolution", type=str, default="1024x1024")
411
+ p.add_argument("--seed", type=int, default=42)
412
+ p.add_argument("--checkpoint-interval", type=int, default=50,
413
+ help="Save checkpoint every N puzzles (default: 50)")
414
+ return p.parse_args()
415
+
416
+
417
+ if __name__ == "__main__":
418
+ args = parse_args()
419
+ kwargs = vars(args)
420
+ if isinstance(kwargs["num_per_clue"], list) and len(kwargs["num_per_clue"]) == 1:
421
+ kwargs["num_per_clue"] = kwargs["num_per_clue"][0]
422
+ if kwargs["resolution"]:
423
+ kwargs["resolution"] = parse_resolution(kwargs["resolution"])
424
+ generate_dataset(**kwargs)
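A small smoke-test call of the entry point above (argument values are illustrative; repeating the same call later resumes from sudoku_demo/metadata.json as long as the generation parameters are unchanged):

generate_dataset(
    output_dir="sudoku_demo",
    clue_levels=[30, 40],
    num_per_clue=5,
    sol_num="<=3",
    resolution=(512, 512),
    checkpoint_interval=2,
)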
sudoku/jsonl_to_csv.py ADDED
@@ -0,0 +1,22 @@
1
+ import json
2
+ import csv
3
+ from pathlib import Path
4
+
5
+ dataset='sudoku'
6
+ split='train'
7
+
8
+ # Load test data
9
+ with open(f'{dataset}/{split}_info.jsonl', 'r') as f:
10
+ data = [json.loads(line) for line in f]
11
+
12
+ # Write to CSV
13
+ with open(f'{dataset}/{split}.csv', 'w', newline='', encoding='utf-8') as f:
14
+ writer = csv.writer(f)
15
+ writer.writerow(['input_image', 'video', 'prompt'])
16
+
17
+ for item in data:
18
+ writer.writerow([
19
+ 'images/' + item['image'],
20
+ 'videos/' + item['video'],
21
+ item['prompt'],
22
+ ])
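The resulting CSV keeps the prompt and prefixes the media paths with images/ and videos/. A sketch of the header row plus one data row of sudoku/train.csv (the filenames follow the clue{level}_{index} pattern used by generate_dataset.py; the concrete values are hypothetical):

# input_image,video,prompt
# images/clue30_0001.png,videos/clue30_0001_sol0.mp4,Solve this Sudoku puzzle using red font.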
sudoku/simplify_dataset.py ADDED
@@ -0,0 +1,19 @@
1
+ import json
2
+
3
+ dataset = 'sudoku_600'
4
+ split = 'test'
5
+
6
+ # Read original data
7
+ with open(f'{dataset}/{split}.jsonl', 'r') as f:
8
+ data = [json.loads(line) for line in f]
9
+
10
+ # Transform to simplified format
11
+ new_data = [{'prompt': d['prompt'], 'image': d['image']} for d in data]
12
+
13
+ # Save simplified data to {split}.jsonl
14
+ with open(f'{dataset}/{split}.jsonl', 'w') as f:
15
+ f.writelines(json.dumps(item) + '\n' for item in new_data)
16
+
17
+ # Save original data to {split}_info.jsonl
18
+ with open(f'{dataset}/{split}_info.jsonl', 'w') as f:
19
+ f.writelines(json.dumps(item) + '\n' for item in data)
sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0-diffusers/diffusion_pytorch_model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a287551bf47373c4e66324c27a84ce2daa48c89acdde4eb8d89178d8ad09da9
3
+ size 4992484608
sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0-diffusers/diffusion_pytorch_model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35a65b38950cf1b3d01460bec6b03e5efdc66854a678f5089a668a2664a91f4c
3
+ size 4898551584
sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0-diffusers/diffusion_pytorch_model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:498c9dc1c9d6edabf5514ff30440c752507c07afba637ea0793b611dcd4fd4ea
3
+ size 4987667104
sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0-diffusers/diffusion_pytorch_model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e33036a5d81e8a72a5ac48657af77bcd751a7dce06e6e883643f1a7e377e69c6
3
+ size 4987711216
sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0-diffusers/diffusion_pytorch_model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f9f17087d5bd04bb83dfe3958bf03a54def4b6029f9677b19b9c98484a9b742
3
+ size 4950959936
sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0-diffusers/diffusion_pytorch_model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29381ca0d3790260c577b3849efd7ecb87d1550dfc2ca52ae1ab7052c615bf32
3
+ size 4950980632
sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0-diffusers/diffusion_pytorch_model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2214191998778c951068e0b15363ddab49e089270b58ef62935aa4a523a5358a
3
+ size 3021537400
sudoku/sudoku/checkpoints/Wan2.1-I2V-14B-720P_full_0206/epoch-0.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:504d9c47c4e559e8df2d2b93a69e094ac2b9d1e65f8124849c7e719b47380952
3
+ size 32789894056
sudoku/sudoku/checkpoints/Wan2.2-TI2V-5B_full/epoch-0.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae539b75e0dc5a5809b3b87dd5b02abc1fa9cda2e79b0d9ca63d5954795eaeec
3
+ size 9999659704
sudoku/sudoku/checkpoints/Wan2.2-TI2V-5B_full/epoch-1.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b303e8e24c73ca768be637d2e787f97b95d8c8ae36ae56e8986842549bd69a0b
3
+ size 9999659704
sudoku/sudoku/checkpoints/Wan2.2-TI2V-5B_full/epoch-2.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c1e05b6a600d53a11306fddc78fce8660cc13716ad330f7c51adb8258118673
3
+ size 9999659704
sudoku/sudoku/checkpoints/Wan2.2-TI2V-5B_full/epoch-3.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5eae0f23c6c4786bcebd2289bc7e06a912f726c94d2e77f20d0b543630751ca
3
+ size 9999659704
sudoku/sudoku/checkpoints/Wan2.2-TI2V-5B_full/epoch-4.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d8ab68aeec9fb7b79dac05b95eb4d2d5b9e3cbe20eaeef37adbca4c6c5565fd
3
+ size 9999659704
sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31/epoch-3.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f2d8a61d1723198c9cc194edb7e562e5aa84e9e7dd358dcffa492d98b5c8c1c
3
+ size 32789894056
sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31_diffusers/diffusion_pytorch_model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e869c2d79321f9c73a4443150d45b2b7455a92c9cb3a3fb1321d698a882fbf3e
3
+ size 4992484608
sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31_diffusers/diffusion_pytorch_model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c24ed79955530349d2f53c16cd038e17281f3728f129b950088812f9d4393a5
3
+ size 4898551584
sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31_diffusers/diffusion_pytorch_model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8721792f07a2411d5d07639bac2a52585aeb20727de09a835b0f6251385a4ee7
3
+ size 4987667104
sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31_diffusers/diffusion_pytorch_model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:342d017549cd13b866272c0c7ae43f9464ce07936a0c77e3f6c6c36b8a6c8aeb
3
+ size 4987711216
sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31_diffusers/diffusion_pytorch_model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1389e9cf029b963914a7c725fb6ab5ced0576b364865e7f9cbb84d4f803b9b74
3
+ size 4950959936
sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31_diffusers/diffusion_pytorch_model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be51484e98b724b87e673cc0eeee8aeac66392e838f57879d5fdc34aef9d4022
3
+ size 4950980632
sudoku/sudoku_600/checkpoints/Wan2.1-I2V-14B-720P_full_1_31_diffusers/diffusion_pytorch_model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffbdf62dff812b47dbf7e4a96e950bcc396a73922301310b9e84bc008d54e34e
3
+ size 3021537400
sudoku/sudoku_processor.py ADDED
@@ -0,0 +1,479 @@
1
+ """
2
+ SudokuProcessor - Sudoku puzzle generation, solving, and rendering using SAT solver.
3
+ Supports efficient diverse multi-solution generation.
4
+ """
5
+ import random
6
+ from typing import List, Tuple, Optional
7
+ import numpy as np
8
+ import cv2
9
+
10
+ try:
11
+ from pysat.solvers import Solver
12
+ HAS_PYSAT = True
13
+ except ImportError:
14
+ HAS_PYSAT = False
15
+ print("Warning: pysat not found, install with: pip install python-sat")
16
+
17
+
18
+ class SudokuProcessor:
19
+ """Handles Sudoku puzzle generation, solving, and image rendering."""
20
+
21
+ def __init__(self, cell_size: int = 60, font_scale: float = 1.2, thickness: int = 2):
22
+ self.cell_size = cell_size
23
+ self.font_scale = font_scale
24
+ self.thickness = thickness
25
+ self.img_size = cell_size * 9
26
+
27
+ # Colors (RGB)
28
+ self.bg_color = (255, 255, 255)
29
+ self.line_color = (0, 0, 0)
30
+ self.original_color = (0, 0, 0)
31
+ self.filled_color = (200, 0, 0)
32
+ self.highlight_color = (255, 255, 200)
33
+
34
+ self._base_clauses_cache = None
35
+
36
+ # ==================== SAT Encoding ====================
37
+
38
+ def _var(self, r: int, c: int, n: int) -> int:
39
+ """Map (row, col, num) to SAT variable (1-indexed)."""
40
+ return r * 81 + c * 9 + n + 1
41
+
42
+ def _decode_var(self, v: int) -> Tuple[int, int, int]:
43
+ v -= 1
44
+ return v // 81, (v % 81) // 9, v % 9
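A quick sanity check of the (row, col, num) <-> variable mapping (illustrative, not part of the module):

p = SudokuProcessor()
assert p._var(0, 0, 0) == 1      # cell (0, 0) holding digit 1 maps to variable 1
assert p._var(8, 8, 8) == 729    # cell (8, 8) holding digit 9 maps to the last variable
assert p._decode_var(p._var(3, 5, 6)) == (3, 5, 6)   # _decode_var inverts _var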
45
+
46
+ def _base_clauses(self) -> List[List[int]]:
47
+ """Generate base Sudoku constraint clauses (cached)."""
48
+ if self._base_clauses_cache is not None:
49
+ return self._base_clauses_cache
50
+
51
+ clauses = []
52
+ for i in range(9):
53
+ for j in range(9):
54
+ clauses.append([self._var(i, j, n) for n in range(9)])
55
+ for n1 in range(9):
56
+ for n2 in range(n1 + 1, 9):
57
+ clauses.append([-self._var(i, j, n1), -self._var(i, j, n2)])
58
+
59
+ for n in range(9):
60
+ for i in range(9):
61
+ clauses.append([self._var(i, j, n) for j in range(9)])
62
+ for j1 in range(9):
63
+ for j2 in range(j1 + 1, 9):
64
+ clauses.append([-self._var(i, j1, n), -self._var(i, j2, n)])
65
+ clauses.append([self._var(j, i, n) for j in range(9)])
66
+ for j1 in range(9):
67
+ for j2 in range(j1 + 1, 9):
68
+ clauses.append([-self._var(j1, i, n), -self._var(j2, i, n)])
69
+ for br in range(3):
70
+ for bc in range(3):
71
+ box = [self._var(br*3+di, bc*3+dj, n) for di in range(3) for dj in range(3)]
72
+ clauses.append(box)
73
+ for i1 in range(9):
74
+ for i2 in range(i1 + 1, 9):
75
+ clauses.append([-box[i1], -box[i2]])
76
+
77
+ self._base_clauses_cache = clauses
78
+ return clauses
79
+
80
+ def _grid_clauses(self, grid: List[List[int]]) -> List[List[int]]:
81
+ return [[self._var(i, j, grid[i][j] - 1)]
82
+ for i in range(9) for j in range(9) if grid[i][j] != 0]
83
+
84
+ def _model_to_grid(self, model: List[int]) -> List[List[int]]:
85
+ grid = [[0] * 9 for _ in range(9)]
86
+ for v in model:
87
+ if 0 < v <= 729:
88
+ r, c, n = self._decode_var(v)
89
+ grid[r][c] = n + 1
90
+ return grid
91
+
92
+ # ==================== Solving ====================
93
+
94
+ def solve(self, grid: List[List[int]]) -> Optional[List[List[int]]]:
95
+ if HAS_PYSAT:
96
+ with Solver(name='g3') as s:
97
+ for c in self._base_clauses() + self._grid_clauses(grid):
98
+ s.add_clause(c)
99
+ return self._model_to_grid(s.get_model()) if s.solve() else None
100
+ return self._solve_backtrack(grid)
101
+
102
+ def _solve_backtrack(self, grid: List[List[int]]) -> Optional[List[List[int]]]:
103
+ board = [row[:] for row in grid]
104
+ return board if self._backtrack(board) else None
105
+
106
+ def _backtrack(self, board: List[List[int]]) -> bool:
107
+ empty = self._find_empty(board)
108
+ if not empty:
109
+ return True
110
+ r, c = empty
111
+ for num in range(1, 10):
112
+ if self._is_valid(board, r, c, num):
113
+ board[r][c] = num
114
+ if self._backtrack(board):
115
+ return True
116
+ board[r][c] = 0
117
+ return False
118
+
119
+ def _find_empty(self, board: List[List[int]]) -> Optional[Tuple[int, int]]:
120
+ for i in range(9):
121
+ for j in range(9):
122
+ if board[i][j] == 0:
123
+ return (i, j)
124
+ return None
125
+
126
+ def _is_valid(self, board: List[List[int]], row: int, col: int, num: int) -> bool:
127
+ if num in board[row]:
128
+ return False
129
+ if any(board[i][col] == num for i in range(9)):
130
+ return False
131
+ br, bc = 3 * (row // 3), 3 * (col // 3)
132
+ return all(board[i][j] != num for i in range(br, br+3) for j in range(bc, bc+3))
133
+
134
+ def count_solutions(self, grid: List[List[int]], limit: int = 2) -> int:
135
+ if HAS_PYSAT:
136
+ count = 0
137
+ with Solver(name='g3') as s:
138
+ for c in self._base_clauses() + self._grid_clauses(grid):
139
+ s.add_clause(c)
140
+ while count < limit and s.solve():
141
+ count += 1
142
+ s.add_clause([-v for v in s.get_model() if 0 < v <= 729])
143
+ return count
144
+ return self._count_backtrack(grid, limit)
145
+
146
+ def _count_backtrack(self, grid: List[List[int]], limit: int) -> int:
147
+ board = [row[:] for row in grid]
148
+ self._sol_count, self._sol_limit = 0, limit
149
+ self._count_helper(board)
150
+ return self._sol_count
151
+
152
+ def _count_helper(self, board: List[List[int]]) -> bool:
153
+ if self._sol_count >= self._sol_limit:
154
+ return True
155
+ empty = self._find_empty(board)
156
+ if not empty:
157
+ self._sol_count += 1
158
+ return self._sol_count >= self._sol_limit
159
+ r, c = empty
160
+ for num in range(1, 10):
161
+ if self._is_valid(board, r, c, num):
162
+ board[r][c] = num
163
+ if self._count_helper(board):
164
+ return True
165
+ board[r][c] = 0
166
+ return False
167
+
+    def find_solutions(self, grid: List[List[int]], limit: int = 10) -> List[List[List[int]]]:
+        if HAS_PYSAT:
+            solutions = []
+            with Solver(name='g3') as s:
+                for c in self._base_clauses() + self._grid_clauses(grid):
+                    s.add_clause(c)
+                while len(solutions) < limit and s.solve():
+                    model = s.get_model()
+                    solutions.append(self._model_to_grid(model))
+                    s.add_clause([-v for v in model if 0 < v <= 729])
+            return solutions
+        return self._find_backtrack(grid, limit)
+
+    def _find_backtrack(self, grid: List[List[int]], limit: int) -> List[List[List[int]]]:
+        board, solutions = [row[:] for row in grid], []
+        self._find_helper(board, solutions, limit)
+        return solutions
+
+    def _find_helper(self, board: List[List[int]], solutions: List, limit: int) -> bool:
+        if len(solutions) >= limit:
+            return True
+        empty = self._find_empty(board)
+        if not empty:
+            solutions.append([row[:] for row in board])
+            return len(solutions) >= limit
+        r, c = empty
+        for num in range(1, 10):
+            if self._is_valid(board, r, c, num):
+                board[r][c] = num
+                if self._find_helper(board, solutions, limit):
+                    return True
+                board[r][c] = 0
+        return False
+
+    # ==================== Generation ====================
+
+    def generate(self, clues: int = 30, unique: bool = True) -> Tuple[List[List[int]], List[List[int]]]:
+        """Generate a Sudoku puzzle with the specified number of clues."""
+        solution = self._generate_full_grid()
+        puzzle = [row[:] for row in solution]
+
+        cells = [(i, j) for i in range(9) for j in range(9)]
+        random.shuffle(cells)
+
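+        # Dig cells one at a time; with unique=True a removal is only kept if the
+        # puzzle still has exactly one solution (count_solutions with limit=2 is
+        # enough to distinguish "one" from "more than one").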
+        removed, target = 0, 81 - clues
+        for r, c in cells:
+            if removed >= target:
+                break
+            backup = puzzle[r][c]
+            puzzle[r][c] = 0
+            if unique and self.count_solutions(puzzle, 2) != 1:
+                puzzle[r][c] = backup
+            else:
+                removed += 1
+
+        return puzzle, solution
+
+    def _generate_full_grid(self) -> List[List[int]]:
+        if HAS_PYSAT:
+            with Solver(name='g3') as s:
+                for c in self._base_clauses():
+                    s.add_clause(c)
+                cells = [(i, j) for i in range(9) for j in range(9)]
+                random.shuffle(cells)
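+                # Seed the solver with a few random but mutually consistent cell
+                # assumptions so repeated calls produce different grids; the final
+                # solve() then completes the remaining cells.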
+                assumptions = []
+                for r, c in cells[:11]:
+                    nums = list(range(9))
+                    random.shuffle(nums)
+                    for n in nums:
+                        if s.solve(assumptions=assumptions + [self._var(r, c, n)]):
+                            assumptions.append(self._var(r, c, n))
+                            break
+                s.solve(assumptions=assumptions)
+                return self._model_to_grid(s.get_model())
+
+        board = [[0] * 9 for _ in range(9)]
+        self._fill_grid(board)
+        return board
+
+    def _fill_grid(self, board: List[List[int]]) -> bool:
+        empty = self._find_empty(board)
+        if not empty:
+            return True
+        r, c = empty
+        nums = list(range(1, 10))
+        random.shuffle(nums)
+        for num in nums:
+            if self._is_valid(board, r, c, num):
+                board[r][c] = num
+                if self._fill_grid(board):
+                    return True
+                board[r][c] = 0
+        return False
+
+    # ==================== Diverse Multi-Solution Generation ====================
+
+    @staticmethod
+    def _hamming(sol1: List[List[int]], sol2: List[List[int]]) -> int:
+        """Count differing cells between two complete grids."""
+        return sum(sol1[i][j] != sol2[i][j] for i in range(9) for j in range(9))
+
+    @staticmethod
+    def _greedy_diverse_select(
+        candidates: List[List[List[int]]],
+        target_count: int,
+        min_hamming: int,
+        _hamming_fn=None,
+    ) -> List[List[List[int]]]:
+        """
+        Greedily select diverse solutions using farthest-point sampling.
+
+        1. Start with a random candidate.
+        2. Repeatedly add the candidate with maximum min-distance to the selected set.
+        3. Stop when enough are selected or no candidate meets min_hamming.
+        """
+        if _hamming_fn is None:
+            _hamming_fn = SudokuProcessor._hamming
+
+        if len(candidates) <= 1:
+            return list(candidates)
+
+        n = len(candidates)
+
+        # Pre-compute pairwise distances
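+        # (O(n^2) grid comparisons; n is at most the number of enumerated
+        # candidates, a few hundred in generate_multi_solution.)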
+        dist = [[0] * n for _ in range(n)]
+        for i in range(n):
+            for j in range(i + 1, n):
+                d = _hamming_fn(candidates[i], candidates[j])
+                dist[i][j] = d
+                dist[j][i] = d
+
+        # Farthest-point sampling
+        selected = [random.randint(0, n - 1)]
+        remaining = set(range(n)) - {selected[0]}
+
+        while len(selected) < target_count and remaining:
+            best_idx = -1
+            best_min_dist = -1
+
+            for r in remaining:
+                min_d = min(dist[r][s] for s in selected)
+                if min_d > best_min_dist:
+                    best_min_dist = min_d
+                    best_idx = r
+
+            if best_min_dist < min_hamming:
+                break
+
+            selected.append(best_idx)
+            remaining.discard(best_idx)
+
+        return [candidates[i] for i in selected]
+
+    def generate_multi_solution(
+        self,
+        clues: int,
+        min_solutions: int = 2,
+        max_solutions: int = 5,
+        max_attempts: int = 100,
+        min_hamming: int = 10
+    ) -> Tuple[List[List[int]], List[List[List[int]]]]:
+        """
+        Generate a puzzle with multiple diverse solutions.
+
+        Puzzle-first strategy:
+        1. Generate a full grid, then randomly remove (81 - clues) cells WITHOUT
+           a uniqueness check → guaranteed to have ≥1 solution, likely many.
+        2. Enumerate candidate solutions of this puzzle via SAT.
+        3. Greedily select diverse solutions (farthest-point sampling).
+        4. If there are not enough diverse solutions, retry with a new puzzle.
+
+        This is correct because all returned solutions are guaranteed to be valid
+        completions of the returned puzzle.
+
+        Args:
+            clues: Number of given cells.
+            min_solutions: Minimum diverse solutions required.
+            max_solutions: Maximum to return.
+            max_attempts: Outer retry budget.
+            min_hamming: Minimum pairwise Hamming distance.
+        Returns:
+            (puzzle, solutions) — all solutions are valid and pairwise diverse.
+        Raises:
+            RuntimeError: If unable to find a qualifying puzzle.
+        """
+        # Adaptive hamming threshold
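+        # With more clues there are fewer empty cells, so two solutions can differ
+        # in at most (81 - clues) positions; the required pairwise distance is
+        # therefore relaxed as the clue count grows.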
+        adaptive_hamming = min_hamming
+        if min_hamming == 10:  # default → auto-adapt
+            if clues >= 55:
+                adaptive_hamming = 3
+            elif clues >= 45:
+                adaptive_hamming = 5
+            elif clues >= 35:
+                adaptive_hamming = 8
+            else:
+                adaptive_hamming = 12
+
+        # Adaptive search depth: more empty cells → more solutions likely exist
+        empty_cells = 81 - clues
+        if empty_cells <= 15:
+            max_search = 30
+        elif empty_cells <= 25:
+            max_search = 80
+        elif empty_cells <= 40:
+            max_search = 150
+        else:
+            max_search = 300
+
+        for _ in range(max_attempts):
+            # Phase 1: Generate puzzle (random removal, no uniqueness check)
+            solution = self._generate_full_grid()
+            puzzle = [row[:] for row in solution]
+
+            cells = [(i, j) for i in range(9) for j in range(9)]
+            random.shuffle(cells)
+            for r, c in cells[:81 - clues]:
+                puzzle[r][c] = 0
+
+            # # Phase 2: Quick feasibility — need at least min_solutions solutions
+            # quick_count = self.count_solutions(puzzle, min_solutions + 1)
+            # if quick_count < min_solutions:
+            #     continue
+
+            # Phase 3: Enumerate candidates
+            candidates = self.find_solutions(puzzle, max_search)
+            if len(candidates) < min_solutions:
+                continue
+
+            # Phase 4: Greedy diverse selection
+            diverse = self._greedy_diverse_select(
+                candidates, max_solutions, adaptive_hamming
+            )
+
+            if len(diverse) >= min_solutions:
+                return puzzle, diverse[:max_solutions]
+
+        raise RuntimeError(
+            f"Failed to generate puzzle with {min_solutions}-{max_solutions} "
+            f"diverse solutions (hamming>={adaptive_hamming}) after {max_attempts} attempts"
+        )
+
+    # ==================== Encoding ====================
+
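+    # encode()/decode() use the conventional 81-character row-major digit string,
+    # with '0' marking an empty cell (row 1 of the grid gives the first nine characters).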
+    def encode(self, grid: List[List[int]]) -> str:
+        return ''.join(str(grid[i][j]) for i in range(9) for j in range(9))
+
+    def decode(self, s: str) -> List[List[int]]:
+        return [[int(s[i * 9 + j]) for j in range(9)] for i in range(9)]
+
+    # ==================== Rendering ====================
+
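+    # render() returns an (img_size x img_size x 3) uint8 array using the color
+    # attributes on the instance; the demo below converts RGB -> BGR before
+    # handing the image to cv2.imwrite.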
+    def render(
+        self,
+        grid: List[List[int]],
+        highlight_new: Optional[Tuple[int, int]] = None,
+        original: Optional[List[List[int]]] = None
+    ) -> np.ndarray:
+        img = np.full((self.img_size, self.img_size, 3), self.bg_color, dtype=np.uint8)
+        cs = self.cell_size
+
+        if highlight_new:
+            r, c = highlight_new
+            cv2.rectangle(img, (c * cs, r * cs), ((c+1) * cs, (r+1) * cs), self.highlight_color, -1)
+
+        for i in range(10):
+            thick = 3 if i % 3 == 0 else 1
+            pos = i * cs
+            cv2.line(img, (pos, 0), (pos, self.img_size), self.line_color, thick)
+            cv2.line(img, (0, pos), (self.img_size, pos), self.line_color, thick)
+
+        font = cv2.FONT_HERSHEY_SIMPLEX
+        for i in range(9):
+            for j in range(9):
+                if grid[i][j] == 0:
+                    continue
+                is_original = original is None or original[i][j] != 0
+                color = self.original_color if is_original else self.filled_color
+                text = str(grid[i][j])
+                (tw, th), _ = cv2.getTextSize(text, font, self.font_scale, self.thickness)
+                cv2.putText(img, text, (j*cs + (cs-tw)//2, i*cs + (cs+th)//2),
+                            font, self.font_scale, color, self.thickness)
+
+        return img
+
+
+if __name__ == "__main__":
+    proc = SudokuProcessor()
+    print(f"Using {'SAT solver' if HAS_PYSAT else 'backtracking'}...")
+
+    # Test unique puzzle
+    puzzle, solution = proc.generate(clues=25, unique=True)
+    print("Puzzle:")
+    for row in puzzle:
+        print(row)
+    print(f"Clues: {sum(c != 0 for row in puzzle for c in row)}")
+
+    cv2.imwrite("test_puzzle.png", cv2.cvtColor(proc.render(puzzle), cv2.COLOR_RGB2BGR))
+    cv2.imwrite("test_solution.png", cv2.cvtColor(proc.render(solution, original=puzzle), cv2.COLOR_RGB2BGR))
+    print("Saved test images.")
+
+    # Test diverse multi-solution at various clue levels
+    print("\n=== Testing diverse multi-solution generation ===")
+    for clues in [25, 35, 45, 55]:
+        print(f"\nClues: {clues}")
+        try:
+            puzzle_m, solutions_m = proc.generate_multi_solution(
+                clues=clues, min_solutions=3, max_solutions=3, min_hamming=10
+            )
+            print(f"  Generated puzzle with {len(solutions_m)} diverse solutions")
+            for i in range(len(solutions_m)):
+                for j in range(i + 1, len(solutions_m)):
+                    print(f"    Hamming(sol{i}, sol{j}) = {proc._hamming(solutions_m[i], solutions_m[j])}")
+        except RuntimeError as e:
+            print(f"    {e}")
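+
+    # Round-trip sanity check for the string encoding defined above
+    assert proc.decode(proc.encode(puzzle)) == puzzle
+    print("Encode/decode round-trip OK.")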