Jayce-Ping committed
Commit 460dc79 · verified · 1 Parent(s): 8a42298

Add files using upload-large-folder tool

Files changed (2)
  1. ARC/video_evaluate.py +324 -0
  2. ARC/video_generate.py +597 -0
ARC/video_evaluate.py ADDED
@@ -0,0 +1,324 @@
+"""ARC-AGI-2 Video Answer Evaluator.
+
+Extracts the test output grid from the last frame of a generated video,
+then compares it against the ground-truth answer.
+
+Color recovery pipeline:
+    1. Match pixel RGB against the canonical ARC_COLORS palette → permuted color index
+    2. Apply inverse permutation → original color index
+    3. Compare with ground truth
+
+Usage:
+    python video_evaluate.py --video_dir videos --data_dir data --output results.json
+"""
+
+import json
+import random
+import argparse
+from pathlib import Path
+
+from collections import defaultdict
+import cv2
+import numpy as np
+from tqdm import tqdm
+
+# ── ARC Color Palette (RGB) ───────────────────────────────────────────────────
+
+ARC_COLORS = np.array([
+    [0x00, 0x00, 0x00],  # 0: black
+    [0x00, 0x74, 0xD9],  # 1: blue
+    [0xFF, 0x41, 0x36],  # 2: red
+    [0x2E, 0xCC, 0x40],  # 3: green
+    [0xFF, 0xDC, 0x00],  # 4: yellow
+    [0xAA, 0xAA, 0xAA],  # 5: grey
+    [0xF0, 0x12, 0xBE],  # 6: magenta
+    [0xFF, 0x85, 0x1B],  # 7: orange
+    [0x7F, 0xDB, 0xFF],  # 8: light blue
+    [0x87, 0x0C, 0x25],  # 9: maroon
+], dtype=np.uint8)
+
+
+# ── Color Permutation Utilities ────────────────────────────────────────────────
+
+def generate_color_permutation(seed: int) -> list[int]:
+    """Reproduce the same permutation used during video generation."""
+    rng = random.Random(seed)
+    perm = list(range(10))
+    rng.shuffle(perm)
+    return perm
+
+
+def invert_permutation(perm: list[int]) -> list[int]:
+    """Compute inverse permutation: inv[perm[i]] = i."""
+    inv = [0] * len(perm)
+    for i, p in enumerate(perm):
+        inv[p] = i
+    return inv
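+
+# Sanity check (illustrative sketch; the seed value is arbitrary): the inverse
+# permutation round-trips, so a rendered (permuted) color index always maps
+# back to the original color index.
+#
+#   perm = generate_color_permutation(seed=7)
+#   inv = invert_permutation(perm)
+#   assert all(inv[perm[i]] == i for i in range(10))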
+
+
+# ── Layout Computation (mirrors video_generate.py exactly) ─────────────────────
+
+def compute_test_output_bbox(task: dict, canvas_h: int, canvas_w: int) -> dict:
+    """Compute pixel bounding box of the test output grid region.
+
+    Replicates _compute_layout + render_frame positioning from video_generate.py.
+    """
+    n_cols = len(task["train"]) + 1
+    n_rows = 2
+    padding = 12
+    outer_margin = 16
+    label_h = 20
+
+    usable_w = canvas_w - 2 * outer_margin - (n_cols - 1) * padding
+    usable_h = canvas_h - 2 * outer_margin - (n_rows - 1) * padding
+    cell_w = usable_w // n_cols
+    cell_h = usable_h // n_rows
+
+    total_block_w = cell_w * n_cols + (n_cols - 1) * padding
+    total_block_h = cell_h * n_rows + (n_rows - 1) * padding
+    margin_x = (canvas_w - total_block_w) // 2
+    margin_y = (canvas_h - total_block_h) // 2
+
+    # Test output: last column, second row
+    col = n_cols - 1
+    x0 = margin_x + col * (cell_w + padding)
+    y0 = margin_y + cell_h + padding
+
+    test_out = np.array(task["test"][0]["output"])
+    gr, gc = test_out.shape
+
+    return {
+        "grid_rows": gr,
+        "grid_cols": gc,
+        "grid_x0": x0,
+        "grid_y0": y0 + label_h,
+        "grid_w": cell_w,
+        "grid_h": cell_h - label_h,
+    }
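+
+# Worked example (illustrative sketch): a task with 3 train pairs on a 1280x720
+# canvas gives n_cols = 4, usable_w = 1280 - 32 - 36 = 1212, cell_w = 303,
+# cell_h = (720 - 32 - 12) // 2 = 338, and margin_x = margin_y = 16. The test
+# output cell then starts at x0 = 16 + 3 * (303 + 12) = 961 and
+# y0 = 16 + 338 + 12 = 366, so the sampled grid area is 303 x 318 px from (961, 386).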
+
+
+# ── Frame Extraction ───────────────────────────────────────────────────────────
+
+def extract_last_frame(video_path: str) -> np.ndarray:
+    """Extract the last frame from a video as an RGB numpy array."""
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise FileNotFoundError(f"Cannot open video: {video_path}")
+
+    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    cap.set(cv2.CAP_PROP_POS_FRAMES, max(0, total - 1))
+    ret, frame = cap.read()
+    cap.release()
+
+    if not ret:
+        raise RuntimeError(f"Failed to read last frame from {video_path}")
+    return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+
+# ── Grid Extraction ────────────────────────────────────────────────────────────
+
+def extract_grid_from_frame(
+    frame: np.ndarray,
+    grid_x0: int,
+    grid_y0: int,
+    grid_w: int,
+    grid_h: int,
+    grid_rows: int,
+    grid_cols: int,
+) -> list[list[int]]:
+    """Extract ARC grid by sampling cell centers and matching to ARC_COLORS.
+
+    Always matches against the canonical ARC_COLORS palette. The returned
+    indices are the permuted color values as rendered in the video.
+
+    Args:
+        frame: RGB image (H, W, 3).
+        grid_x0, grid_y0: Top-left of grid area (below label).
+        grid_w, grid_h: Grid area dimensions.
+        grid_rows, grid_cols: Expected grid shape.
+
+    Returns:
+        Grid of permuted color indices (apply inverse perm to get originals).
+    """
+    cell_h = grid_h / grid_rows
+    cell_w = grid_w / grid_cols
+
+    grid = []
+    for r in range(grid_rows):
+        row = []
+        cy = int(grid_y0 + (r + 0.5) * cell_h)
+        for c in range(grid_cols):
+            cx = int(grid_x0 + (c + 0.5) * cell_w)
+            # 3x3 patch average for codec artifact robustness
+            patch = frame[max(0, cy - 1): cy + 2, max(0, cx - 1): cx + 2]
+            avg = patch.mean(axis=(0, 1)).astype(np.uint8)
+            dists = np.sum((ARC_COLORS.astype(int) - avg.astype(int)) ** 2, axis=1)
+            row.append(int(np.argmin(dists)))
+        grid.append(row)
+    return grid
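+
+# Decoding sketch (illustrative): a patch average of, say, (250, 70, 60) from a
+# lossy-compressed red cell is closest in squared RGB distance to ARC_COLORS[2]
+# = (0xFF, 0x41, 0x36), so the cell decodes to permuted index 2.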
+
+
+# ── Evaluation ─────────────────────────────────────────────────────────────────
+
+def evaluate_video(
+    video_path: str,
+    task: dict,
+    perm: list[int],
+    canvas_h: int = 720,
+    canvas_w: int = 1280,
+) -> dict:
+    """Evaluate a single video against ground truth.
+
+    Pipeline:
+        1. Extract last frame (full answer revealed)
+        2. Locate test output region via layout math
+        3. Sample cell centers → match to ARC_COLORS → get permuted color indices
+        4. Apply inverse permutation → recover original color indices
+        5. Compare with ground truth
+
+    Returns:
+        Dict with 'correct', 'predicted_grid', 'ground_truth', 'pixel_accuracy'.
+    """
+    frame = extract_last_frame(video_path)
+    bbox = compute_test_output_bbox(task, canvas_h, canvas_w)
+
+    # Step 1: extract permuted color indices from rendered pixels
+    permuted_grid = extract_grid_from_frame(frame, **bbox)
+
+    # Step 2: invert permutation to recover original values
+    inv = invert_permutation(perm)
+    predicted = [[inv[cell] for cell in row] for row in permuted_grid]
+
+    # Step 3: compare with ground truth
+    gt = task["test"][0]["output"]
+    correct = (predicted == gt)
+
+    gt_flat = [c for row in gt for c in row]
+    pred_flat = [c for row in predicted for c in row]
+    n_match = sum(a == b for a, b in zip(gt_flat, pred_flat))
+    pixel_acc = n_match / max(len(gt_flat), 1)
+
+    return {
+        "correct": correct,
+        "predicted_grid": predicted,
+        "ground_truth": gt,
+        "pixel_accuracy": pixel_acc,
+    }
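+
+# Minimal single-video usage (illustrative sketch; file names are hypothetical):
+#
+#   with open("data/training/0934a4d8.json") as f:
+#       task = json.load(f)
+#   perm = generate_color_permutation(seed=0)   # seed parsed from the filename
+#   result = evaluate_video("videos/0934a4d8_0.mp4", task, perm)
+#   print(result["correct"], result["pixel_accuracy"])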
+
+
+# ── Batch Evaluation ───────────────────────────────────────────────────────────
+
+def evaluate_all(
+    video_dir: str = "videos",
+    data_dir: str = "data",
+    output_file: str = "results.json",
+) -> None:
+    """Evaluate all videos against ground-truth tasks.
+
+    Recovers the color permutation from the seed in the filename
+    ({task_id}_{seed}.mp4) using the same RNG as video_generate.py.
+    """
+    video_path = Path(video_dir)
+    data_path = Path(data_dir)
+
+    # Build task file lookup
+    task_files: dict[str, Path] = {}
+    for subdir in ["training", "evaluation"]:
+        d = data_path / subdir
+        if d.exists():
+            for fp in d.glob("*.json"):
+                task_files[fp.stem] = fp
+
+    videos = sorted(video_path.glob("*.mp4"))
+    if not videos:
+        print(f"No videos found in {video_dir}")
+        return
+
+    # Auto-detect resolution from first video
+    cap = cv2.VideoCapture(str(videos[0]))
+    canvas_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    canvas_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    cap.release()
+    print(f"Detected resolution: {canvas_h}x{canvas_w}")
+
+    results = {}
+    total_correct = 0
+    total_count = 0
+
+    for vp in tqdm(videos, desc="Evaluating"):
+        stem = vp.stem
+        parts = stem.rsplit("_", 1)
+        if len(parts) != 2:
+            continue
+        task_id, seed_str = parts
+
+        if task_id not in task_files:
+            tqdm.write(f"Skip {stem}: task not found")
+            continue
+
+        with open(task_files[task_id]) as f:
+            task = json.load(f)
+
+        if not task.get("test") or "output" not in task["test"][0]:
+            continue
+
+        # Recover the exact permutation from seed
+        seed = int(seed_str)
+        perm = generate_color_permutation(seed)
+
+        try:
+            result = evaluate_video(str(vp), task, perm, canvas_h, canvas_w)
+            results[stem] = {
+                "correct": result["correct"],
+                "pixel_accuracy": result["pixel_accuracy"],
+                "task_id": task_id,
+                "seed": seed_str,
+            }
+            total_count += 1
+            if result["correct"]:
+                total_correct += 1
+        except Exception as e:
+            tqdm.write(f"Error {stem}: {e}")
+            results[stem] = {"error": str(e), "task_id": task_id}
+
+    acc = total_correct / max(total_count, 1)
+
+    # Per-task pixel accuracy aggregation
+    task_pixels: dict[str, list[float]] = defaultdict(list)
+    for v in results.values():
+        if "pixel_accuracy" in v:
+            task_pixels[v["task_id"]].append(v["pixel_accuracy"])
+
+    per_task_pixel_acc = {
+        tid: round(sum(accs) / len(accs), 4)
+        for tid, accs in sorted(task_pixels.items())
+    }
+
+    summary = {
+        "total_videos": total_count,
+        "correct": total_correct,
+        "accuracy": round(acc, 4),
+        "mean_pixel_accuracy": round(
+            sum(per_task_pixel_acc.values()) / max(len(per_task_pixel_acc), 1), 4
+        ),
+        "per_task_pixel_accuracy": per_task_pixel_acc,
+        "results": results,
+    }
+
+    with open(output_file, "w") as f:
+        json.dump(summary, f, indent=2)
+
+    print(f"\nResults: {total_correct}/{total_count} correct ({acc:.2%})")
+    print(f"Mean pixel accuracy (per-task avg): {summary['mean_pixel_accuracy']:.2%}")
+    print(f"Saved to {output_file}")
+
+
+# ── CLI ────────────────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    p = argparse.ArgumentParser(description="ARC Video Evaluator")
+    p.add_argument("--video_dir", type=str, default="videos")
+    p.add_argument("--data_dir", type=str, default="data")
+    p.add_argument("--output", type=str, default="results.json")
+    args = p.parse_args()
+    evaluate_all(args.video_dir, args.data_dir, args.output)
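+
+# Shape of the written results.json (illustrative sketch; numbers are made up):
+#
+#   {
+#     "total_videos": 120,
+#     "correct": 87,
+#     "accuracy": 0.725,
+#     "mean_pixel_accuracy": 0.9112,
+#     "per_task_pixel_accuracy": {"<task_id>": 0.95, ...},
+#     "results": {"<task_id>_<seed>": {"correct": true, ...}, ...}
+#   }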
ARC/video_generate.py ADDED
@@ -0,0 +1,597 @@
+"""ARC-AGI-2 Task Video Generator.
+
+Generates animated videos for ARC tasks that progressively reveal test outputs.
+Supports random color permutation for data augmentation.
+Renders directly to a target resolution with auto-calculated grid layout.
+Outputs train.jsonl / test.jsonl with stratified splits.
+
+Usage:
+    python video_generate.py --data_dir data --output_dir videos \
+        --n_frames 5 --m_frames 5 --k_rate 1.0 \
+        --repeat_num 3 --fps 15 \
+        --resolution 720 1280 --train_ratio 0.9
+"""
+
+import json
+import csv
+import argparse
+import random
+import math
+from pathlib import Path
+
+from tqdm import tqdm
+
+import cv2
+import numpy as np
+
+# ── ARC Color Palette (RGB) ───────────────────────────────────────────────────
+
+ARC_COLORS = np.array([
+    [0x00, 0x00, 0x00],  # 0: black
+    [0x00, 0x74, 0xD9],  # 1: blue
+    [0xFF, 0x41, 0x36],  # 2: red
+    [0x2E, 0xCC, 0x40],  # 3: green
+    [0xFF, 0xDC, 0x00],  # 4: yellow
+    [0xAA, 0xAA, 0xAA],  # 5: grey
+    [0xF0, 0x12, 0xBE],  # 6: magenta
+    [0xFF, 0x85, 0x1B],  # 7: orange
+    [0x7F, 0xDB, 0xFF],  # 8: light blue
+    [0x87, 0x0C, 0x25],  # 9: maroon
+], dtype=np.uint8)
+
+GRID_LINE_COLOR = (200, 200, 200)
+LABEL_COLOR = (40, 40, 40)
+BG_COLOR = (255, 255, 255)
+UNREVEALED_COLOR = np.array([220, 220, 220], dtype=np.uint8)
+
+
+# ── Color Permutation ──────────────────────────────────────────────────────────
+
+def generate_color_permutation(seed: int) -> list[int]:
+    """Generate a deterministic color permutation from a seed."""
+    rng = random.Random(seed)
+    perm = list(range(10))
+    rng.shuffle(perm)
+    return perm
+
+
+def apply_color_permutation(grid: list[list[int]], perm: list[int]) -> list[list[int]]:
+    """Apply color permutation to a grid (nested list)."""
+    return [[perm[cell] for cell in row] for row in grid]
+
+
+def permute_task(task: dict, perm: list[int]) -> dict:
+    """Return a deep-copied task with all grids color-permuted."""
+    new_task = {"train": [], "test": []}
+    for pair in task["train"]:
+        new_task["train"].append({
+            "input": apply_color_permutation(pair["input"], perm),
+            "output": apply_color_permutation(pair["output"], perm),
+        })
+    for pair in task["test"]:
+        new_pair = {"input": apply_color_permutation(pair["input"], perm)}
+        if "output" in pair:
+            new_pair["output"] = apply_color_permutation(pair["output"], perm)
+        new_task["test"].append(new_pair)
+    return new_task
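+
+# Toy example (illustrative): with perm = [1, 0, 2, 3, 4, 5, 6, 7, 8, 9]
+# (swap colors 0 and 1), apply_color_permutation([[0, 1], [2, 0]], perm)
+# returns [[1, 0], [2, 1]]; permute_task applies the same mapping to every
+# train and test grid of a task.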
+
+
+# ── Direct Canvas Grid Rendering ───────────────────────────────────────────────
+
+def _render_grid_to_region(
+    canvas: np.ndarray,
+    grid: np.ndarray,
+    x0: int, y0: int, w: int, h: int,
+    label: str,
+    rows_revealed: int | None = None,
+) -> None:
+    """Render a single ARC grid into a rectangular region of the canvas."""
+    label_h = 20
+    grid_y0 = y0 + label_h
+    grid_h = h - label_h
+    grid_w = w
+
+    if grid_h <= 0 or grid_w <= 0:
+        return
+
+    gr, gc = grid.shape
+    cell_h = grid_h / gr
+    cell_w = grid_w / gc
+
+    for r in range(gr):
+        for c in range(gc):
+            cy = int(grid_y0 + r * cell_h)
+            cx = int(x0 + c * cell_w)
+            cy2 = int(grid_y0 + (r + 1) * cell_h)
+            cx2 = int(x0 + (c + 1) * cell_w)
+
+            if rows_revealed is not None and r >= rows_revealed:
+                color = tuple(UNREVEALED_COLOR.tolist())
+            else:
+                color = tuple(ARC_COLORS[grid[r, c]].tolist())
+
+            cv2.rectangle(canvas, (cx, cy), (cx2, cy2), color, -1)
+
+    for r in range(gr + 1):
+        ly = int(grid_y0 + r * cell_h)
+        cv2.line(canvas, (x0, ly), (x0 + grid_w, ly), GRID_LINE_COLOR, 1)
+    for c in range(gc + 1):
+        lx = int(x0 + c * cell_w)
+        cv2.line(canvas, (lx, grid_y0), (lx, grid_y0 + grid_h), GRID_LINE_COLOR, 1)
+
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    font_scale = 0.8
+    thickness = 1
+    (tw, th), _ = cv2.getTextSize(label, font, font_scale, thickness)
+    tx = x0 + (w - tw) // 2
+    ty = y0 + label_h - 4
+    cv2.putText(canvas, label, (tx, ty), font, font_scale, LABEL_COLOR, thickness, cv2.LINE_AA)
+
+
+# ── Layout Calculation ─────────────────────────────────────────────────────────
+
+def _compute_layout(task: dict, canvas_h: int, canvas_w: int) -> dict:
+    """Compute uniform grid layout for all pairs on the canvas."""
+    n_cols = len(task["train"]) + 1
+    n_rows = 2
+
+    padding = 12
+    outer_margin = 16
+    label_h = 20
+
+    usable_w = canvas_w - 2 * outer_margin - (n_cols - 1) * padding
+    usable_h = canvas_h - 2 * outer_margin - (n_rows - 1) * padding
+
+    cell_w = usable_w // n_cols
+    cell_h = usable_h // n_rows
+
+    total_block_w = cell_w * n_cols + (n_cols - 1) * padding
+    total_block_h = cell_h * n_rows + (n_rows - 1) * padding
+    margin_x = (canvas_w - total_block_w) // 2
+    margin_y = (canvas_h - total_block_h) // 2
+
+    return {
+        "n_cols": n_cols, "n_rows": n_rows,
+        "cell_w": cell_w, "cell_h": cell_h,
+        "margin_x": margin_x, "margin_y": margin_y,
+        "padding": padding, "label_h": label_h,
+    }
+
+
+# ── Frame Rendering ────────────────────────────────────────────────────────────
+
+def render_frame(
+    task: dict, test_idx: int, rows_revealed: int | None,
+    canvas_h: int = 720, canvas_w: int = 1280,
+) -> np.ndarray:
+    """Render one video frame as an RGB numpy array."""
+    canvas = np.full((canvas_h, canvas_w, 3), BG_COLOR, dtype=np.uint8)
+    layout = _compute_layout(task, canvas_h, canvas_w)
+
+    n_cols = layout["n_cols"]
+    cell_w, cell_h = layout["cell_w"], layout["cell_h"]
+    mx, my, pad = layout["margin_x"], layout["margin_y"], layout["padding"]
+
+    train_pairs = task["train"]
+    test_pair = task["test"][test_idx]
+
+    for col in range(n_cols):
+        x0 = mx + col * (cell_w + pad)
+
+        if col < len(train_pairs):
+            inp = np.array(train_pairs[col]["input"])
+            out = np.array(train_pairs[col]["output"])
+            _render_grid_to_region(canvas, inp, x0, my, cell_w, cell_h, f"Train {col+1} In")
+            y1 = my + cell_h + pad
+            _render_grid_to_region(canvas, out, x0, y1, cell_w, cell_h, f"Train {col+1} Out")
+        else:
+            test_in = np.array(test_pair["input"])
+            _render_grid_to_region(canvas, test_in, x0, my, cell_w, cell_h, "Test In")
+            test_out = np.array(test_pair["output"])
+            y1 = my + cell_h + pad
+            reveal = 0 if rows_revealed is None else rows_revealed
+            _render_grid_to_region(canvas, test_out, x0, y1, cell_w, cell_h, "Test Out", rows_revealed=reveal)
+
+    return canvas
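+
+# Preview sketch (illustrative): render_frame returns RGB while cv2.imwrite
+# expects BGR, so a single-frame preview can be saved with:
+#
+#   frame = render_frame(task, test_idx=0, rows_revealed=None)
+#   cv2.imwrite("preview.png", cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))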
+
+
+# ── Video Generation ───────────────────────────────────────────────────────────
+
+def generate_video(
+    task: dict, output_path: str,
+    n_frames: int = 5, m_frames: int = 5, k_rate: float = 1.0,
+    max_frames: int | None = None, fps: int = 15,
+    canvas_h: int = 720, canvas_w: int = 1280,
+) -> int:
+    """Generate a single ARC task video. Returns total frame count."""
+    test_out = np.array(task["test"][0]["output"])
+    total_rows = test_out.shape[0]
+
+    reveal_frames_natural = int(math.ceil(total_rows * k_rate))
+    total_natural = n_frames + reveal_frames_natural + m_frames
+
+    if max_frames is not None and total_natural > max_frames:
+        available_reveal = max(1, max_frames - n_frames - m_frames)
+        effective_k = available_reveal / total_rows
+        reveal_frames = available_reveal
+    else:
+        effective_k = k_rate
+        reveal_frames = reveal_frames_natural
+
+    total_frames = n_frames + reveal_frames + m_frames
+
+    h = canvas_h if canvas_h % 2 == 0 else canvas_h + 1
+    w = canvas_w if canvas_w % 2 == 0 else canvas_w + 1
+
+    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+    writer = cv2.VideoWriter(output_path, fourcc, fps, (w, h))
+
+    def _write(frame_rgb: np.ndarray) -> None:
+        writer.write(cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR))
+
+    pbar = tqdm(total=total_frames, desc=" Frames", leave=False, unit="f")
+
+    # Phase 1: Placeholder
+    placeholder = render_frame(task, 0, None, h, w)
+    for _ in range(n_frames):
+        _write(placeholder)
+        pbar.update(1)
+
+    # Phase 2: Progressive reveal
+    if effective_k >= 1:
+        frames_per_row = effective_k
+        row_cursor = 0
+        accumulated = 0.0
+        for _ in range(reveal_frames):
+            accumulated += 1.0
+            if accumulated >= frames_per_row and row_cursor < total_rows:
+                row_cursor += 1
+                accumulated -= frames_per_row
+            _write(render_frame(task, 0, row_cursor, h, w))
+            pbar.update(1)
+    else:
+        rows_per_frame = 1.0 / effective_k
+        row_accum = 0.0
+        for _ in range(reveal_frames):
+            row_accum += rows_per_frame
+            rows_shown = min(int(math.ceil(row_accum)), total_rows)
+            _write(render_frame(task, 0, rows_shown, h, w))
+            pbar.update(1)
+
+    # Phase 3: Full answer
+    full = render_frame(task, 0, total_rows, h, w)
+    for _ in range(m_frames):
+        _write(full)
+        pbar.update(1)
+
+    pbar.close()
+    writer.release()
+    return total_frames
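+
+# Pacing example (illustrative): a 10-row test output with k_rate=1.0 yields 10
+# reveal frames, one new row per frame. Capping with max_frames=15 and
+# n_frames=m_frames=5 leaves available_reveal = 5, so effective_k = 0.5 and the
+# fractional branch reveals 2 new rows per frame via the ceil accumulator.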
+
+
+# ── Metadata Cache ─────────────────────────────────────────────────────────────
+
+METADATA_FILE = ".metadata.json"
+
+
+def _build_params_dict(
+    data_dir: str, n_frames: int, m_frames: int, k_rate: float,
+    max_frames: int | None, fps: int, repeat_num: int,
+    canvas_h: int, canvas_w: int,
+) -> dict:
+    """Build a JSON-serializable dict of generation parameters."""
+    return {
+        "data_dir": str(Path(data_dir).resolve()),
+        "n_frames": n_frames, "m_frames": m_frames,
+        "k_rate": k_rate, "max_frames": max_frames,
+        "fps": fps, "repeat_num": repeat_num,
+        "canvas_h": canvas_h, "canvas_w": canvas_w,
+    }
+
+
+def _load_metadata(out_path: Path) -> dict | None:
+    meta_path = out_path / METADATA_FILE
+    if not meta_path.exists():
+        return None
+    try:
+        with open(meta_path) as f:
+            return json.load(f)
+    except (json.JSONDecodeError, OSError):
+        return None
+
+
+def _save_metadata(out_path: Path, params: dict, completed: set[str]) -> None:
+    meta = {"params": params, "completed": sorted(completed)}
+    tmp_path = (out_path / METADATA_FILE).with_suffix(".tmp")
+    with open(tmp_path, "w") as f:
+        json.dump(meta, f, indent=2)
+    tmp_path.replace(out_path / METADATA_FILE)
+
+
+def _clear_output_dir(out_path: Path) -> None:
+    if out_path.exists():
+        for mp4 in out_path.glob("*.mp4"):
+            mp4.unlink()
+        meta = out_path / METADATA_FILE
+        if meta.exists():
+            meta.unlink()
+
+
+def compute_test_output_bbox(task: dict, canvas_h: int, canvas_w: int) -> dict:
+    """Compute the pixel bounding box of the test output cell."""
+    n_cols = len(task["train"]) + 1
+    n_rows = 2
+    padding = 12
+    outer_margin = 16
+    label_h = 20
+
+    usable_w = canvas_w - 2 * outer_margin - (n_cols - 1) * padding
+    usable_h = canvas_h - 2 * outer_margin - (n_rows - 1) * padding
+    cell_w = usable_w // n_cols
+    cell_h = usable_h // n_rows
+
+    total_block_w = cell_w * n_cols + (n_cols - 1) * padding
+    total_block_h = cell_h * n_rows + (n_rows - 1) * padding
+    margin_x = (canvas_w - total_block_w) // 2
+    margin_y = (canvas_h - total_block_h) // 2
+
+    col = n_cols - 1
+    x0 = margin_x + col * (cell_w + padding)
+    y0 = margin_y + cell_h + padding
+
+    test_out = np.array(task["test"][0]["output"])
+    gr, gc = test_out.shape
+
+    return {
+        "grid_rows": gr, "grid_cols": gc,
+        "x0": x0, "y0": y0,
+        "grid_x0": x0, "grid_y0": y0 + label_h,
+        "grid_w": cell_w, "grid_h": cell_h - label_h,
+        "cell_w": cell_w, "cell_h": cell_h,
+    }
+
+
+def save_video_metadata(
+    task: dict, perm: list[int], seed: int,
+    canvas_h: int, canvas_w: int, meta_path: str,
+) -> None:
+    """Save per-video metadata JSON for evaluation."""
+    bbox = compute_test_output_bbox(task, canvas_h, canvas_w)
+    permuted_palette = ARC_COLORS[perm].tolist()
+
+    meta = {
+        "seed": seed,
+        "color_perm": perm,
+        "permuted_palette": permuted_palette,
+        "canvas_h": canvas_h,
+        "canvas_w": canvas_w,
+        **bbox,
+    }
+    Path(meta_path).parent.mkdir(parents=True, exist_ok=True)
+    with open(meta_path, "w") as f:
+        json.dump(meta, f, indent=2)
+
+
+# ── Train/Test Split ───────────────────────────────────────────────────────────
+
+def _write_splits(
+    all_samples: list[dict],
+    out_path: Path,
+    train_ratio: float,
+) -> None:
+    """Stratified train/test split by source, write JSONL and CSV files."""
+    rng = random.Random(42)
+
+    by_source: dict[str, list[dict]] = {}
+    for s in all_samples:
+        by_source.setdefault(s["source"], []).append(s)
+
+    train_samples, test_samples = [], []
+    for source in sorted(by_source):
+        group = by_source[source]
+        rng.shuffle(group)
+        split_idx = int(len(group) * train_ratio)
+        train_samples.extend(group[:split_idx])
+        test_samples.extend(group[split_idx:])
+
+    rng.shuffle(train_samples)
+    rng.shuffle(test_samples)
+
+    # JSONL
+    for name, samples in [("train", train_samples), ("test", test_samples)]:
+        with open(out_path / f"{name}.jsonl", "w") as f:
+            for s in samples:
+                f.write(json.dumps(s) + "\n")
+
+    # CSV
+    for name, samples in [("train", train_samples), ("test", test_samples)]:
+        with open(out_path / f"{name}.csv", "w", newline="", encoding="utf-8") as f:
+            writer = csv.writer(f)
+            writer.writerow(["video", "meta", "task_id", "source", "prompt"])
+            for s in samples:
+                writer.writerow([s["video"], s["meta"], s["task_id"], s["source"], s["prompt"]])
+
+    tqdm.write(f" Split: {len(train_samples)} train / {len(test_samples)} test")
+    tqdm.write(f" Written: train.jsonl, test.jsonl, train.csv, test.csv")
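+
+# Split example (illustrative): with train_ratio=0.9 and, say, 300 "training"
+# and 120 "evaluation" samples, each source is split independently, giving
+# 270 + 108 = 378 train and 30 + 12 = 42 test samples before the final shuffle.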
+
+
+# ── Batch Processing ───────────────────────────────────────────────────────────
+
+def process_all(
+    data_dir: str = "data",
+    output_dir: str = "videos",
+    n_frames: int = 5,
+    m_frames: int = 5,
+    k_rate: float = 1.0,
+    max_frames: int | None = None,
+    fps: int = 15,
+    repeat_num: int = 3,
+    canvas_h: int = 720,
+    canvas_w: int = 1280,
+    train_ratio: float = 0.9,
+    prompt: str = "Predict the test output grid based on the input-output training examples.",
+) -> None:
+    """Generate videos for all ARC tasks with train/test JSONL splits.
+
+    Supports resumption via metadata cache. After generation, writes
+    stratified train.jsonl / test.jsonl / CSV files.
+    """
+    data_path = Path(data_dir)
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+
+    current_params = _build_params_dict(
+        data_dir, n_frames, m_frames, k_rate, max_frames, fps, repeat_num,
+        canvas_h, canvas_w,
+    )
+    existing_meta = _load_metadata(out_path)
+
+    if existing_meta is not None and existing_meta.get("params") == current_params:
+        completed: set[str] = {
+            name for name in existing_meta.get("completed", [])
+            if (out_path / name).exists()
+        }
+        tqdm.write(f"Resuming: {len(completed)} videos already completed.")
+    else:
+        if existing_meta is not None:
+            tqdm.write("Parameters changed — clearing and restarting.")
+        _clear_output_dir(out_path)
+        completed = set()
+        _save_metadata(out_path, current_params, completed)
+
+    task_files = sorted(
+        list((data_path / "training").glob("*.json"))
+        + list((data_path / "evaluation").glob("*.json"))
+    )
+    if not task_files:
+        print(f"No task files found in {data_path}/training or {data_path}/evaluation")
+        return
+
+    total = len(task_files) * repeat_num
+    pbar = tqdm(total=total, desc="Tasks", unit="vid", initial=len(completed))
+    save_every = 20
+    new_since_save = 0
+    all_samples: list[dict] = []
+
+    for fpath in task_files:
+        task_id = fpath.stem
+        source = fpath.parent.name  # "training" or "evaluation"
+        with open(fpath) as f:
+            task_raw = json.load(f)
+
+        if not task_raw.get("test") or "output" not in task_raw["test"][0]:
+            pbar.update(repeat_num)
+            continue
+
+        test_out_arr = np.array(task_raw["test"][0]["output"])
+        grid_rows, grid_cols = test_out_arr.shape
+
+        used_perms: set[tuple[int, ...]] = set()
+        seed = 0
+        generated = 0
+
+        while generated < repeat_num:
+            perm = generate_color_permutation(seed)
+            perm_key = tuple(perm)
+
+            if perm_key not in used_perms:
+                used_perms.add(perm_key)
+                video_name = f"{task_id}_{seed}.mp4"
+                meta_name = f"{task_id}_{seed}.meta.json"
+
+                sample_meta = {
+                    "task_id": task_id,
+                    "source": source,
+                    "seed": seed,
+                    "video": video_name,
+                    "meta": meta_name,
+                    "prompt": prompt,
+                    "grid_rows": int(grid_rows),
+                    "grid_cols": int(grid_cols),
+                    "color_perm": perm,
+                    "n_train_pairs": len(task_raw["train"]),
+                }
+
+                if video_name not in completed:
+                    permuted_task = permute_task(task_raw, perm)
+                    pbar.set_postfix_str(f"{task_id}_{seed}")
+                    video_file = str(out_path / video_name)
+
+                    frame_count = generate_video(
+                        permuted_task, video_file,
+                        n_frames=n_frames, m_frames=m_frames, k_rate=k_rate,
+                        max_frames=max_frames, fps=fps,
+                        canvas_h=canvas_h, canvas_w=canvas_w,
+                    )
+                    sample_meta["frame_count"] = frame_count
+
+                    meta_file = video_file.replace(".mp4", ".meta.json")
+                    save_video_metadata(
+                        task=permuted_task, perm=perm, seed=seed,
+                        canvas_h=canvas_h, canvas_w=canvas_w, meta_path=meta_file,
+                    )
+
+                    completed.add(video_name)
+                    pbar.update(1)
+                    new_since_save += 1
+
+                    if new_since_save >= save_every:
+                        _save_metadata(out_path, current_params, completed)
+                        new_since_save = 0
+
+                all_samples.append(sample_meta)
+                generated += 1
+
+            seed += 1
+            if seed > repeat_num + 1000:
+                tqdm.write(f"Warning: could not generate {repeat_num} unique perms for {task_id}")
+                pbar.update(repeat_num - generated)
+                break
+
+    pbar.close()
+    _save_metadata(out_path, current_params, completed)
+
+    # Write train/test splits
+    _write_splits(all_samples, out_path, train_ratio)
+
+    tqdm.write(f"Done. {len(completed)} videos, {len(all_samples)} samples in {out_path}/")
+
+
+# ── CLI ────────────────────────────────────────────────────────────────────────
+
+def parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser(description="ARC-AGI-2 Video Generator")
+    p.add_argument("--data_dir", type=str, default="ARC-AGI-2/data")
+    p.add_argument("--output_dir", type=str, default="videos")
+    p.add_argument("--n_frames", type=int, default=5)
+    p.add_argument("--m_frames", type=int, default=5)
+    p.add_argument("--k_rate", type=float, default=1.0)
+    p.add_argument("--max_frames", type=int, default=None)
+    p.add_argument("--fps", type=int, default=15)
+    p.add_argument("--repeat_num", type=int, default=3)
+    p.add_argument("--resolution", type=int, nargs=2, default=[720, 1280],
+                   metavar=("H", "W"))
+    p.add_argument("--train_ratio", type=float, default=0.9,
+                   help="Train split ratio (default: 0.9)")
+    p.add_argument("--prompt", type=str,
+                   default="Predict the test output grid based on the input-output training examples.")
+    return p.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    process_all(
+        data_dir=args.data_dir,
+        output_dir=args.output_dir,
+        n_frames=args.n_frames,
+        m_frames=args.m_frames,
+        k_rate=args.k_rate,
+        max_frames=args.max_frames,
+        fps=args.fps,
+        repeat_num=args.repeat_num,
+        canvas_h=args.resolution[0],
+        canvas_w=args.resolution[1],
+        train_ratio=args.train_ratio,
+        prompt=args.prompt,
+    )
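+
+# End-to-end usage (illustrative sketch; values shown are the defaults above):
+#
+#   python video_generate.py --data_dir ARC-AGI-2/data --output_dir videos \
+#       --repeat_num 3 --fps 15 --resolution 720 1280
+#   python video_evaluate.py --video_dir videos --data_dir ARC-AGI-2/data \
+#       --output results.json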