Jayce-Ping committed
Commit 460dc79 · verified · 1 Parent(s): 8a42298

Add files using upload-large-folder tool

Files changed (2)
  1. ARC/video_evaluate.py +324 -0
  2. ARC/video_generate.py +597 -0
ARC/video_evaluate.py ADDED
@@ -0,0 +1,324 @@
+"""ARC-AGI-2 Video Answer Evaluator.
+
+Extracts the test output grid from the last frame of a generated video,
+then compares it against the ground-truth answer.
+
+Color recovery pipeline:
+    1. Match pixel RGB against the canonical ARC_COLORS palette → permuted color index
+    2. Apply inverse permutation → original color index
+    3. Compare with ground truth
+
+Usage:
+    python video_evaluate.py --video_dir videos --data_dir data --output results.json
+"""
+
+import json
+import random
+import argparse
+from pathlib import Path
+
+from collections import defaultdict
+import cv2
+import numpy as np
+from tqdm import tqdm
+
+# ── ARC Color Palette (RGB) ───────────────────────────────────────────────────
+
+ARC_COLORS = np.array([
+    [0x00, 0x00, 0x00],  # 0: black
+    [0x00, 0x74, 0xD9],  # 1: blue
+    [0xFF, 0x41, 0x36],  # 2: red
+    [0x2E, 0xCC, 0x40],  # 3: green
+    [0xFF, 0xDC, 0x00],  # 4: yellow
+    [0xAA, 0xAA, 0xAA],  # 5: grey
+    [0xF0, 0x12, 0xBE],  # 6: magenta
+    [0xFF, 0x85, 0x1B],  # 7: orange
+    [0x7F, 0xDB, 0xFF],  # 8: light blue
+    [0x87, 0x0C, 0x25],  # 9: maroon
+], dtype=np.uint8)
+
+
+# ── Color Permutation Utilities ────────────────────────────────────────────────
+
+def generate_color_permutation(seed: int) -> list[int]:
+    """Reproduce the same permutation used during video generation."""
+    rng = random.Random(seed)
+    perm = list(range(10))
+    rng.shuffle(perm)
+    return perm
+
+
+def invert_permutation(perm: list[int]) -> list[int]:
+    """Compute inverse permutation: inv[perm[i]] = i."""
+    inv = [0] * len(perm)
+    for i, p in enumerate(perm):
+        inv[p] = i
+    return inv
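+
+# Sanity check (illustrative sketch; the seed value is arbitrary): the inverse
+# permutation round-trips, so a rendered (permuted) color index always maps
+# back to the original color index.
+#
+#   perm = generate_color_permutation(seed=7)
+#   inv = invert_permutation(perm)
+#   assert all(inv[perm[i]] == i for i in range(10))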
+
+
+# ── Layout Computation (mirrors video_generate.py exactly) ─────────────────────
+
+def compute_test_output_bbox(task: dict, canvas_h: int, canvas_w: int) -> dict:
+    """Compute pixel bounding box of the test output grid region.
+
+    Replicates _compute_layout + render_frame positioning from video_generate.py.
+    """
+    n_cols = len(task["train"]) + 1
+    n_rows = 2
+    padding = 12
+    outer_margin = 16
+    label_h = 20
+
+    usable_w = canvas_w - 2 * outer_margin - (n_cols - 1) * padding
+    usable_h = canvas_h - 2 * outer_margin - (n_rows - 1) * padding
+    cell_w = usable_w // n_cols
+    cell_h = usable_h // n_rows
+
+    total_block_w = cell_w * n_cols + (n_cols - 1) * padding
+    total_block_h = cell_h * n_rows + (n_rows - 1) * padding
+    margin_x = (canvas_w - total_block_w) // 2
+    margin_y = (canvas_h - total_block_h) // 2
+
+    # Test output: last column, second row
+    col = n_cols - 1
+    x0 = margin_x + col * (cell_w + padding)
+    y0 = margin_y + cell_h + padding
+
+    test_out = np.array(task["test"][0]["output"])
+    gr, gc = test_out.shape
+
+    return {
+        "grid_rows": gr,
+        "grid_cols": gc,
+        "grid_x0": x0,
+        "grid_y0": y0 + label_h,
+        "grid_w": cell_w,
+        "grid_h": cell_h - label_h,
+    }
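+
+# Worked example (illustrative sketch): a task with 3 train pairs on a 1280x720
+# canvas gives n_cols = 4, usable_w = 1280 - 32 - 36 = 1212, cell_w = 303,
+# cell_h = (720 - 32 - 12) // 2 = 338, and margin_x = margin_y = 16. The test
+# output cell then starts at x0 = 16 + 3 * (303 + 12) = 961 and
+# y0 = 16 + 338 + 12 = 366, so the sampled grid area is 303 x 318 px from (961, 386).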
+
+
+# ── Frame Extraction ───────────────────────────────────────────────────────────
+
+def extract_last_frame(video_path: str) -> np.ndarray:
+    """Extract the last frame from a video as an RGB numpy array."""
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise FileNotFoundError(f"Cannot open video: {video_path}")
+
+    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    cap.set(cv2.CAP_PROP_POS_FRAMES, max(0, total - 1))
+    ret, frame = cap.read()
+    cap.release()
+
+    if not ret:
+        raise RuntimeError(f"Failed to read last frame from {video_path}")
+    return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+
+# ── Grid Extraction ────────────────────────────────────────────────────────────
+
+def extract_grid_from_frame(
+    frame: np.ndarray,
+    grid_x0: int,
+    grid_y0: int,
+    grid_w: int,
+    grid_h: int,
+    grid_rows: int,
+    grid_cols: int,
+) -> list[list[int]]:
+    """Extract ARC grid by sampling cell centers and matching to ARC_COLORS.
+
+    Always matches against the canonical ARC_COLORS palette. The returned
+    indices are the permuted color values as rendered in the video.
+
+    Args:
+        frame: RGB image (H, W, 3).
+        grid_x0, grid_y0: Top-left of grid area (below label).
+        grid_w, grid_h: Grid area dimensions.
+        grid_rows, grid_cols: Expected grid shape.
+
+    Returns:
+        Grid of permuted color indices (apply inverse perm to get originals).
+    """
+    cell_h = grid_h / grid_rows
+    cell_w = grid_w / grid_cols
+
+    grid = []
+    for r in range(grid_rows):
+        row = []
+        cy = int(grid_y0 + (r + 0.5) * cell_h)
+        for c in range(grid_cols):
+            cx = int(grid_x0 + (c + 0.5) * cell_w)
+            # 3x3 patch average for codec artifact robustness
+            patch = frame[max(0, cy - 1): cy + 2, max(0, cx - 1): cx + 2]
+            avg = patch.mean(axis=(0, 1)).astype(np.uint8)
+            dists = np.sum((ARC_COLORS.astype(int) - avg.astype(int)) ** 2, axis=1)
+            row.append(int(np.argmin(dists)))
+        grid.append(row)
+    return grid
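+
+# Decoding sketch (illustrative): a patch average of, say, (250, 70, 60) from a
+# lossy-compressed red cell is closest in squared RGB distance to ARC_COLORS[2]
+# = (0xFF, 0x41, 0x36), so the cell decodes to permuted index 2.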
+
+
+# ── Evaluation ─────────────────────────────────────────────────────────────────
+
+def evaluate_video(
+    video_path: str,
+    task: dict,
+    perm: list[int],
+    canvas_h: int = 720,
+    canvas_w: int = 1280,
+) -> dict:
+    """Evaluate a single video against ground truth.
+
+    Pipeline:
+        1. Extract last frame (full answer revealed)
+        2. Locate test output region via layout math
+        3. Sample cell centers → match to ARC_COLORS → get permuted color indices
+        4. Apply inverse permutation → recover original color indices
+        5. Compare with ground truth
+
+    Returns:
+        Dict with 'correct', 'predicted_grid', 'ground_truth', 'pixel_accuracy'.
+    """
+    frame = extract_last_frame(video_path)
+    bbox = compute_test_output_bbox(task, canvas_h, canvas_w)
+
+    # Step 1: extract permuted color indices from rendered pixels
+    permuted_grid = extract_grid_from_frame(frame, **bbox)
+
+    # Step 2: invert permutation to recover original values
+    inv = invert_permutation(perm)
+    predicted = [[inv[cell] for cell in row] for row in permuted_grid]
+
+    # Step 3: compare with ground truth
+    gt = task["test"][0]["output"]
+    correct = (predicted == gt)
+
+    gt_flat = [c for row in gt for c in row]
+    pred_flat = [c for row in predicted for c in row]
+    n_match = sum(a == b for a, b in zip(gt_flat, pred_flat))
+    pixel_acc = n_match / max(len(gt_flat), 1)
+
+    return {
+        "correct": correct,
+        "predicted_grid": predicted,
+        "ground_truth": gt,
+        "pixel_accuracy": pixel_acc,
+    }
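+
+# Minimal single-video usage (illustrative sketch; file names are hypothetical):
+#
+#   with open("data/training/0934a4d8.json") as f:
+#       task = json.load(f)
+#   perm = generate_color_permutation(seed=0)   # seed parsed from the filename
+#   result = evaluate_video("videos/0934a4d8_0.mp4", task, perm)
+#   print(result["correct"], result["pixel_accuracy"])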
+
+
+# ── Batch Evaluation ───────────────────────────────────────────────────────────
+
+def evaluate_all(
+    video_dir: str = "videos",
+    data_dir: str = "data",
+    output_file: str = "results.json",
+) -> None:
+    """Evaluate all videos against ground-truth tasks.
+
+    Recovers the color permutation from the seed in the filename
+    ({task_id}_{seed}.mp4) using the same RNG as video_generate.py.
+    """
+    video_path = Path(video_dir)
+    data_path = Path(data_dir)
+
+    # Build task file lookup
+    task_files: dict[str, Path] = {}
+    for subdir in ["training", "evaluation"]:
+        d = data_path / subdir
+        if d.exists():
+            for fp in d.glob("*.json"):
+                task_files[fp.stem] = fp
+
+    videos = sorted(video_path.glob("*.mp4"))
+    if not videos:
+        print(f"No videos found in {video_dir}")
+        return
+
+    # Auto-detect resolution from first video
+    cap = cv2.VideoCapture(str(videos[0]))
+    canvas_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    canvas_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    cap.release()
+    print(f"Detected resolution: {canvas_h}x{canvas_w}")
+
+    results = {}
+    total_correct = 0
+    total_count = 0
+
+    for vp in tqdm(videos, desc="Evaluating"):
+        stem = vp.stem
+        parts = stem.rsplit("_", 1)
+        if len(parts) != 2:
+            continue
+        task_id, seed_str = parts
+
+        if task_id not in task_files:
+            tqdm.write(f"Skip {stem}: task not found")
+            continue
+
+        with open(task_files[task_id]) as f:
+            task = json.load(f)
+
+        if not task.get("test") or "output" not in task["test"][0]:
+            continue
+
+        # Recover the exact permutation from seed
+        seed = int(seed_str)
+        perm = generate_color_permutation(seed)
+
+        try:
+            result = evaluate_video(str(vp), task, perm, canvas_h, canvas_w)
+            results[stem] = {
+                "correct": result["correct"],
+                "pixel_accuracy": result["pixel_accuracy"],
+                "task_id": task_id,
+                "seed": seed_str,
+            }
+            total_count += 1
+            if result["correct"]:
+                total_correct += 1
+        except Exception as e:
+            tqdm.write(f"Error {stem}: {e}")
+            results[stem] = {"error": str(e), "task_id": task_id}
+
+    acc = total_correct / max(total_count, 1)
+
+    # Per-task pixel accuracy aggregation
+    task_pixels: dict[str, list[float]] = defaultdict(list)
+    for v in results.values():
+        if "pixel_accuracy" in v:
+            task_pixels[v["task_id"]].append(v["pixel_accuracy"])
+
+    per_task_pixel_acc = {
+        tid: round(sum(accs) / len(accs), 4)
+        for tid, accs in sorted(task_pixels.items())
+    }
+
+    summary = {
+        "total_videos": total_count,
+        "correct": total_correct,
+        "accuracy": round(acc, 4),
+        "mean_pixel_accuracy": round(
+            sum(per_task_pixel_acc.values()) / max(len(per_task_pixel_acc), 1), 4
+        ),
+        "per_task_pixel_accuracy": per_task_pixel_acc,
+        "results": results,
+    }
+
+    with open(output_file, "w") as f:
+        json.dump(summary, f, indent=2)
+
+    print(f"\nResults: {total_correct}/{total_count} correct ({acc:.2%})")
+    print(f"Mean pixel accuracy (per-task avg): {summary['mean_pixel_accuracy']:.2%}")
+    print(f"Saved to {output_file}")
+
+
+# ── CLI ────────────────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    p = argparse.ArgumentParser(description="ARC Video Evaluator")
+    p.add_argument("--video_dir", type=str, default="videos")
+    p.add_argument("--data_dir", type=str, default="data")
+    p.add_argument("--output", type=str, default="results.json")
+    args = p.parse_args()
+    evaluate_all(args.video_dir, args.data_dir, args.output)
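+
+# Shape of the written results.json (illustrative sketch; numbers are made up):
+#
+#   {
+#     "total_videos": 120,
+#     "correct": 87,
+#     "accuracy": 0.725,
+#     "mean_pixel_accuracy": 0.9112,
+#     "per_task_pixel_accuracy": {"<task_id>": 0.95, ...},
+#     "results": {"<task_id>_<seed>": {"correct": true, ...}, ...}
+#   }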
ARC/video_generate.py ADDED
@@ -0,0 +1,597 @@
+"""ARC-AGI-2 Task Video Generator.
+
+Generates animated videos for ARC tasks that progressively reveal test outputs.
+Supports random color permutation for data augmentation.
+Renders directly to a target resolution with auto-calculated grid layout.
+Outputs train.jsonl / test.jsonl with stratified splits.
+
+Usage:
+    python video_generate.py --data_dir data --output_dir videos \
+        --n_frames 5 --m_frames 5 --k_rate 1.0 \
+        --repeat_num 3 --fps 15 \
+        --resolution 720 1280 --train_ratio 0.9
+"""
+
+import json
+import csv
+import argparse
+import random
+import math
+from pathlib import Path
+
+from tqdm import tqdm
+
+import cv2
+import numpy as np
+
+# ── ARC Color Palette (RGB) ───────────────────────────────────────────────────
+
+ARC_COLORS = np.array([
+    [0x00, 0x00, 0x00],  # 0: black
+    [0x00, 0x74, 0xD9],  # 1: blue
+    [0xFF, 0x41, 0x36],  # 2: red
+    [0x2E, 0xCC, 0x40],  # 3: green
+    [0xFF, 0xDC, 0x00],  # 4: yellow
+    [0xAA, 0xAA, 0xAA],  # 5: grey
+    [0xF0, 0x12, 0xBE],  # 6: magenta
+    [0xFF, 0x85, 0x1B],  # 7: orange
+    [0x7F, 0xDB, 0xFF],  # 8: light blue
+    [0x87, 0x0C, 0x25],  # 9: maroon
+], dtype=np.uint8)
+
+GRID_LINE_COLOR = (200, 200, 200)
+LABEL_COLOR = (40, 40, 40)
+BG_COLOR = (255, 255, 255)
+UNREVEALED_COLOR = np.array([220, 220, 220], dtype=np.uint8)
+
+
+# ── Color Permutation ──────────────────────────────────────────────────────────
+
+def generate_color_permutation(seed: int) -> list[int]:
+    """Generate a deterministic color permutation from a seed."""
+    rng = random.Random(seed)
+    perm = list(range(10))
+    rng.shuffle(perm)
+    return perm
+
+
+def apply_color_permutation(grid: list[list[int]], perm: list[int]) -> list[list[int]]:
+    """Apply color permutation to a grid (nested list)."""
+    return [[perm[cell] for cell in row] for row in grid]
+
+
+def permute_task(task: dict, perm: list[int]) -> dict:
+    """Return a deep-copied task with all grids color-permuted."""
+    new_task = {"train": [], "test": []}
+    for pair in task["train"]:
+        new_task["train"].append({
+            "input": apply_color_permutation(pair["input"], perm),
+            "output": apply_color_permutation(pair["output"], perm),
+        })
+    for pair in task["test"]:
+        new_pair = {"input": apply_color_permutation(pair["input"], perm)}
+        if "output" in pair:
+            new_pair["output"] = apply_color_permutation(pair["output"], perm)
+        new_task["test"].append(new_pair)
+    return new_task
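+
+# Toy example (illustrative): with perm = [1, 0, 2, 3, 4, 5, 6, 7, 8, 9]
+# (swap colors 0 and 1), apply_color_permutation([[0, 1], [2, 0]], perm)
+# returns [[1, 0], [2, 1]]; permute_task applies the same mapping to every
+# train and test grid of a task.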
+
+
+# ── Direct Canvas Grid Rendering ───────────────────────────────────────────────
+
+def _render_grid_to_region(
+    canvas: np.ndarray,
+    grid: np.ndarray,
+    x0: int, y0: int, w: int, h: int,
+    label: str,
+    rows_revealed: int | None = None,
+) -> None:
+    """Render a single ARC grid into a rectangular region of the canvas."""
+    label_h = 20
+    grid_y0 = y0 + label_h
+    grid_h = h - label_h
+    grid_w = w
+
+    if grid_h <= 0 or grid_w <= 0:
+        return
+
+    gr, gc = grid.shape
+    cell_h = grid_h / gr
+    cell_w = grid_w / gc
+
+    for r in range(gr):
+        for c in range(gc):
+            cy = int(grid_y0 + r * cell_h)
+            cx = int(x0 + c * cell_w)
+            cy2 = int(grid_y0 + (r + 1) * cell_h)
+            cx2 = int(x0 + (c + 1) * cell_w)
+
+            if rows_revealed is not None and r >= rows_revealed:
+                color = tuple(UNREVEALED_COLOR.tolist())
+            else:
+                color = tuple(ARC_COLORS[grid[r, c]].tolist())
+
+            cv2.rectangle(canvas, (cx, cy), (cx2, cy2), color, -1)
+
+    for r in range(gr + 1):
+        ly = int(grid_y0 + r * cell_h)
+        cv2.line(canvas, (x0, ly), (x0 + grid_w, ly), GRID_LINE_COLOR, 1)
+    for c in range(gc + 1):
+        lx = int(x0 + c * cell_w)
+        cv2.line(canvas, (lx, grid_y0), (lx, grid_y0 + grid_h), GRID_LINE_COLOR, 1)
+
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    font_scale = 0.8
+    thickness = 1
+    (tw, th), _ = cv2.getTextSize(label, font, font_scale, thickness)
+    tx = x0 + (w - tw) // 2
+    ty = y0 + label_h - 4
+    cv2.putText(canvas, label, (tx, ty), font, font_scale, LABEL_COLOR, thickness, cv2.LINE_AA)
+
+
+# ── Layout Calculation ─────────────────────────────────────────────────────────
+
+def _compute_layout(task: dict, canvas_h: int, canvas_w: int) -> dict:
+    """Compute uniform grid layout for all pairs on the canvas."""
+    n_cols = len(task["train"]) + 1
+    n_rows = 2
+
+    padding = 12
+    outer_margin = 16
+    label_h = 20
+
+    usable_w = canvas_w - 2 * outer_margin - (n_cols - 1) * padding
+    usable_h = canvas_h - 2 * outer_margin - (n_rows - 1) * padding
+
+    cell_w = usable_w // n_cols
+    cell_h = usable_h // n_rows
+
+    total_block_w = cell_w * n_cols + (n_cols - 1) * padding
+    total_block_h = cell_h * n_rows + (n_rows - 1) * padding
+    margin_x = (canvas_w - total_block_w) // 2
+    margin_y = (canvas_h - total_block_h) // 2
+
+    return {
+        "n_cols": n_cols, "n_rows": n_rows,
+        "cell_w": cell_w, "cell_h": cell_h,
+        "margin_x": margin_x, "margin_y": margin_y,
+        "padding": padding, "label_h": label_h,
+    }
+
+
+# ── Frame Rendering ────────────────────────────────────────────────────────────
+
+def render_frame(
+    task: dict, test_idx: int, rows_revealed: int | None,
+    canvas_h: int = 720, canvas_w: int = 1280,
+) -> np.ndarray:
+    """Render one video frame as an RGB numpy array."""
+    canvas = np.full((canvas_h, canvas_w, 3), BG_COLOR, dtype=np.uint8)
+    layout = _compute_layout(task, canvas_h, canvas_w)
+
+    n_cols = layout["n_cols"]
+    cell_w, cell_h = layout["cell_w"], layout["cell_h"]
+    mx, my, pad = layout["margin_x"], layout["margin_y"], layout["padding"]
+
+    train_pairs = task["train"]
+    test_pair = task["test"][test_idx]
+
+    for col in range(n_cols):
+        x0 = mx + col * (cell_w + pad)
+
+        if col < len(train_pairs):
+            inp = np.array(train_pairs[col]["input"])
+            out = np.array(train_pairs[col]["output"])
+            _render_grid_to_region(canvas, inp, x0, my, cell_w, cell_h, f"Train {col+1} In")
+            y1 = my + cell_h + pad
+            _render_grid_to_region(canvas, out, x0, y1, cell_w, cell_h, f"Train {col+1} Out")
+        else:
+            test_in = np.array(test_pair["input"])
+            _render_grid_to_region(canvas, test_in, x0, my, cell_w, cell_h, "Test In")
+            test_out = np.array(test_pair["output"])
+            y1 = my + cell_h + pad
+            reveal = 0 if rows_revealed is None else rows_revealed
+            _render_grid_to_region(canvas, test_out, x0, y1, cell_w, cell_h, "Test Out", rows_revealed=reveal)
+
+    return canvas
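+
+# Preview sketch (illustrative): render_frame returns RGB while cv2.imwrite
+# expects BGR, so a single-frame preview can be saved with:
+#
+#   frame = render_frame(task, test_idx=0, rows_revealed=None)
+#   cv2.imwrite("preview.png", cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))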
+
+
+# ── Video Generation ───────────────────────────────────────────────────────────
+
+def generate_video(
+    task: dict, output_path: str,
+    n_frames: int = 5, m_frames: int = 5, k_rate: float = 1.0,
+    max_frames: int | None = None, fps: int = 15,
+    canvas_h: int = 720, canvas_w: int = 1280,
+) -> int:
+    """Generate a single ARC task video. Returns total frame count."""
+    test_out = np.array(task["test"][0]["output"])
+    total_rows = test_out.shape[0]
+
+    reveal_frames_natural = int(math.ceil(total_rows * k_rate))
+    total_natural = n_frames + reveal_frames_natural + m_frames
+
+    if max_frames is not None and total_natural > max_frames:
+        available_reveal = max(1, max_frames - n_frames - m_frames)
+        effective_k = available_reveal / total_rows
+        reveal_frames = available_reveal
+    else:
+        effective_k = k_rate
+        reveal_frames = reveal_frames_natural
+
+    total_frames = n_frames + reveal_frames + m_frames
+
+    h = canvas_h if canvas_h % 2 == 0 else canvas_h + 1
+    w = canvas_w if canvas_w % 2 == 0 else canvas_w + 1
+
+    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+    writer = cv2.VideoWriter(output_path, fourcc, fps, (w, h))
+
+    def _write(frame_rgb: np.ndarray) -> None:
+        writer.write(cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR))
+
+    pbar = tqdm(total=total_frames, desc=" Frames", leave=False, unit="f")
+
+    # Phase 1: Placeholder
+    placeholder = render_frame(task, 0, None, h, w)
+    for _ in range(n_frames):
+        _write(placeholder)
+        pbar.update(1)
+
+    # Phase 2: Progressive reveal
+    if effective_k >= 1:
+        frames_per_row = effective_k
+        row_cursor = 0
+        accumulated = 0.0
+        for _ in range(reveal_frames):
+            accumulated += 1.0
+            if accumulated >= frames_per_row and row_cursor < total_rows:
+                row_cursor += 1
+                accumulated -= frames_per_row
+            _write(render_frame(task, 0, row_cursor, h, w))
+            pbar.update(1)
+    else:
+        rows_per_frame = 1.0 / effective_k
+        row_accum = 0.0
+        for _ in range(reveal_frames):
+            row_accum += rows_per_frame
+            rows_shown = min(int(math.ceil(row_accum)), total_rows)
+            _write(render_frame(task, 0, rows_shown, h, w))
+            pbar.update(1)
+
+    # Phase 3: Full answer
+    full = render_frame(task, 0, total_rows, h, w)
+    for _ in range(m_frames):
+        _write(full)
+        pbar.update(1)
+
+    pbar.close()
+    writer.release()
+    return total_frames
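+
+# Pacing example (illustrative): a 10-row test output with k_rate=1.0 yields 10
+# reveal frames, one new row per frame. Capping with max_frames=15 and
+# n_frames=m_frames=5 leaves available_reveal = 5, so effective_k = 0.5 and the
+# fractional branch reveals 2 new rows per frame via the ceil accumulator.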
+
+
+# ── Metadata Cache ─────────────────────────────────────────────────────────────
+
+METADATA_FILE = ".metadata.json"
+
+
+def _build_params_dict(
+    data_dir: str, n_frames: int, m_frames: int, k_rate: float,
+    max_frames: int | None, fps: int, repeat_num: int,
+    canvas_h: int, canvas_w: int,
+) -> dict:
+    """Build a JSON-serializable dict of generation parameters."""
+    return {
+        "data_dir": str(Path(data_dir).resolve()),
+        "n_frames": n_frames, "m_frames": m_frames,
+        "k_rate": k_rate, "max_frames": max_frames,
+        "fps": fps, "repeat_num": repeat_num,
+        "canvas_h": canvas_h, "canvas_w": canvas_w,
+    }
+
+
+def _load_metadata(out_path: Path) -> dict | None:
+    meta_path = out_path / METADATA_FILE
+    if not meta_path.exists():
+        return None
+    try:
+        with open(meta_path) as f:
+            return json.load(f)
+    except (json.JSONDecodeError, OSError):
+        return None
+
+
+def _save_metadata(out_path: Path, params: dict, completed: set[str]) -> None:
+    meta = {"params": params, "completed": sorted(completed)}
+    tmp_path = (out_path / METADATA_FILE).with_suffix(".tmp")
+    with open(tmp_path, "w") as f:
+        json.dump(meta, f, indent=2)
+    tmp_path.replace(out_path / METADATA_FILE)
+
+
+def _clear_output_dir(out_path: Path) -> None:
+    if out_path.exists():
+        for mp4 in out_path.glob("*.mp4"):
+            mp4.unlink()
+        meta = out_path / METADATA_FILE
+        if meta.exists():
+            meta.unlink()
+
+
+def compute_test_output_bbox(task: dict, canvas_h: int, canvas_w: int) -> dict:
+    """Compute the pixel bounding box of the test output cell."""
+    n_cols = len(task["train"]) + 1
+    n_rows = 2
+    padding = 12
+    outer_margin = 16
+    label_h = 20
+
+    usable_w = canvas_w - 2 * outer_margin - (n_cols - 1) * padding
+    usable_h = canvas_h - 2 * outer_margin - (n_rows - 1) * padding
+    cell_w = usable_w // n_cols
+    cell_h = usable_h // n_rows
+
+    total_block_w = cell_w * n_cols + (n_cols - 1) * padding
+    total_block_h = cell_h * n_rows + (n_rows - 1) * padding
+    margin_x = (canvas_w - total_block_w) // 2
+    margin_y = (canvas_h - total_block_h) // 2
+
+    col = n_cols - 1
+    x0 = margin_x + col * (cell_w + padding)
+    y0 = margin_y + cell_h + padding
+
+    test_out = np.array(task["test"][0]["output"])
+    gr, gc = test_out.shape
+
+    return {
+        "grid_rows": gr, "grid_cols": gc,
+        "x0": x0, "y0": y0,
+        "grid_x0": x0, "grid_y0": y0 + label_h,
+        "grid_w": cell_w, "grid_h": cell_h - label_h,
+        "cell_w": cell_w, "cell_h": cell_h,
+    }
+
+
+def save_video_metadata(
+    task: dict, perm: list[int], seed: int,
+    canvas_h: int, canvas_w: int, meta_path: str,
+) -> None:
+    """Save per-video metadata JSON for evaluation."""
+    bbox = compute_test_output_bbox(task, canvas_h, canvas_w)
+    permuted_palette = ARC_COLORS[perm].tolist()
+
+    meta = {
+        "seed": seed,
+        "color_perm": perm,
+        "permuted_palette": permuted_palette,
+        "canvas_h": canvas_h,
+        "canvas_w": canvas_w,
+        **bbox,
+    }
+    Path(meta_path).parent.mkdir(parents=True, exist_ok=True)
+    with open(meta_path, "w") as f:
+        json.dump(meta, f, indent=2)
+
+
+# ── Train/Test Split ───────────────────────────────────────────────────────────
+
+def _write_splits(
+    all_samples: list[dict],
+    out_path: Path,
+    train_ratio: float,
+) -> None:
+    """Stratified train/test split by source, write JSONL and CSV files."""
+    rng = random.Random(42)
+
+    by_source: dict[str, list[dict]] = {}
+    for s in all_samples:
+        by_source.setdefault(s["source"], []).append(s)
+
+    train_samples, test_samples = [], []
+    for source in sorted(by_source):
+        group = by_source[source]
+        rng.shuffle(group)
+        split_idx = int(len(group) * train_ratio)
+        train_samples.extend(group[:split_idx])
+        test_samples.extend(group[split_idx:])
+
+    rng.shuffle(train_samples)
+    rng.shuffle(test_samples)
+
+    # JSONL
+    for name, samples in [("train", train_samples), ("test", test_samples)]:
+        with open(out_path / f"{name}.jsonl", "w") as f:
+            for s in samples:
+                f.write(json.dumps(s) + "\n")
+
+    # CSV
+    for name, samples in [("train", train_samples), ("test", test_samples)]:
+        with open(out_path / f"{name}.csv", "w", newline="", encoding="utf-8") as f:
+            writer = csv.writer(f)
+            writer.writerow(["video", "meta", "task_id", "source", "prompt"])
+            for s in samples:
+                writer.writerow([s["video"], s["meta"], s["task_id"], s["source"], s["prompt"]])
+
+    tqdm.write(f" Split: {len(train_samples)} train / {len(test_samples)} test")
+    tqdm.write(f" Written: train.jsonl, test.jsonl, train.csv, test.csv")
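+
+# Split example (illustrative): with train_ratio=0.9 and, say, 300 "training"
+# and 120 "evaluation" samples, each source is split independently, giving
+# 270 + 108 = 378 train and 30 + 12 = 42 test samples before the final shuffle.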
+
+
+# ── Batch Processing ───────────────────────────────────────────────────────────
+
+def process_all(
+    data_dir: str = "data",
+    output_dir: str = "videos",
+    n_frames: int = 5,
+    m_frames: int = 5,
+    k_rate: float = 1.0,
+    max_frames: int | None = None,
+    fps: int = 15,
+    repeat_num: int = 3,
+    canvas_h: int = 720,
+    canvas_w: int = 1280,
+    train_ratio: float = 0.9,
+    prompt: str = "Predict the test output grid based on the input-output training examples.",
+) -> None:
+    """Generate videos for all ARC tasks with train/test JSONL splits.
+
+    Supports resumption via metadata cache. After generation, writes
+    stratified train.jsonl / test.jsonl / CSV files.
+    """
+    data_path = Path(data_dir)
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+
+    current_params = _build_params_dict(
+        data_dir, n_frames, m_frames, k_rate, max_frames, fps, repeat_num,
+        canvas_h, canvas_w,
+    )
+    existing_meta = _load_metadata(out_path)
+
+    if existing_meta is not None and existing_meta.get("params") == current_params:
+        completed: set[str] = {
+            name for name in existing_meta.get("completed", [])
+            if (out_path / name).exists()
+        }
+        tqdm.write(f"Resuming: {len(completed)} videos already completed.")
+    else:
+        if existing_meta is not None:
+            tqdm.write("Parameters changed — clearing and restarting.")
+        _clear_output_dir(out_path)
+        completed = set()
+        _save_metadata(out_path, current_params, completed)
+
+    task_files = sorted(
+        list((data_path / "training").glob("*.json"))
+        + list((data_path / "evaluation").glob("*.json"))
+    )
+    if not task_files:
+        print(f"No task files found in {data_path}/training or {data_path}/evaluation")
+        return
+
+    total = len(task_files) * repeat_num
+    pbar = tqdm(total=total, desc="Tasks", unit="vid", initial=len(completed))
+    save_every = 20
+    new_since_save = 0
+    all_samples: list[dict] = []
+
+    for fpath in task_files:
+        task_id = fpath.stem
+        source = fpath.parent.name  # "training" or "evaluation"
+        with open(fpath) as f:
+            task_raw = json.load(f)
+
+        if not task_raw.get("test") or "output" not in task_raw["test"][0]:
+            pbar.update(repeat_num)
+            continue
+
+        test_out_arr = np.array(task_raw["test"][0]["output"])
+        grid_rows, grid_cols = test_out_arr.shape
+
+        used_perms: set[tuple[int, ...]] = set()
+        seed = 0
+        generated = 0
+
+        while generated < repeat_num:
+            perm = generate_color_permutation(seed)
+            perm_key = tuple(perm)
+
+            if perm_key not in used_perms:
+                used_perms.add(perm_key)
+                video_name = f"{task_id}_{seed}.mp4"
+                meta_name = f"{task_id}_{seed}.meta.json"
+
+                sample_meta = {
+                    "task_id": task_id,
+                    "source": source,
+                    "seed": seed,
+                    "video": video_name,
+                    "meta": meta_name,
+                    "prompt": prompt,
+                    "grid_rows": int(grid_rows),
+                    "grid_cols": int(grid_cols),
+                    "color_perm": perm,
+                    "n_train_pairs": len(task_raw["train"]),
+                }
+
+                if video_name not in completed:
+                    permuted_task = permute_task(task_raw, perm)
+                    pbar.set_postfix_str(f"{task_id}_{seed}")
+                    video_file = str(out_path / video_name)
+
+                    frame_count = generate_video(
+                        permuted_task, video_file,
+                        n_frames=n_frames, m_frames=m_frames, k_rate=k_rate,
+                        max_frames=max_frames, fps=fps,
+                        canvas_h=canvas_h, canvas_w=canvas_w,
+                    )
+                    sample_meta["frame_count"] = frame_count
+
+                    meta_file = video_file.replace(".mp4", ".meta.json")
+                    save_video_metadata(
+                        task=permuted_task, perm=perm, seed=seed,
+                        canvas_h=canvas_h, canvas_w=canvas_w, meta_path=meta_file,
+                    )
+
+                    completed.add(video_name)
+                    pbar.update(1)
+                    new_since_save += 1
+
+                    if new_since_save >= save_every:
+                        _save_metadata(out_path, current_params, completed)
+                        new_since_save = 0
+
+                all_samples.append(sample_meta)
+                generated += 1
+
+            seed += 1
+            if seed > repeat_num + 1000:
+                tqdm.write(f"Warning: could not generate {repeat_num} unique perms for {task_id}")
+                pbar.update(repeat_num - generated)
+                break
+
+    pbar.close()
+    _save_metadata(out_path, current_params, completed)
+
+    # Write train/test splits
+    _write_splits(all_samples, out_path, train_ratio)
+
+    tqdm.write(f"Done. {len(completed)} videos, {len(all_samples)} samples in {out_path}/")
+
+
+# ── CLI ────────────────────────────────────────────────────────────────────────
+
+def parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser(description="ARC-AGI-2 Video Generator")
+    p.add_argument("--data_dir", type=str, default="ARC-AGI-2/data")
+    p.add_argument("--output_dir", type=str, default="videos")
+    p.add_argument("--n_frames", type=int, default=5)
+    p.add_argument("--m_frames", type=int, default=5)
+    p.add_argument("--k_rate", type=float, default=1.0)
+    p.add_argument("--max_frames", type=int, default=None)
+    p.add_argument("--fps", type=int, default=15)
+    p.add_argument("--repeat_num", type=int, default=3)
+    p.add_argument("--resolution", type=int, nargs=2, default=[720, 1280],
+                   metavar=("H", "W"))
+    p.add_argument("--train_ratio", type=float, default=0.9,
+                   help="Train split ratio (default: 0.9)")
+    p.add_argument("--prompt", type=str,
+                   default="Predict the test output grid based on the input-output training examples.")
+    return p.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    process_all(
+        data_dir=args.data_dir,
+        output_dir=args.output_dir,
+        n_frames=args.n_frames,
+        m_frames=args.m_frames,
+        k_rate=args.k_rate,
+        max_frames=args.max_frames,
+        fps=args.fps,
+        repeat_num=args.repeat_num,
+        canvas_h=args.resolution[0],
+        canvas_w=args.resolution[1],
+        train_ratio=args.train_ratio,
+        prompt=args.prompt,
+    )
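+
+# End-to-end usage (illustrative sketch; values shown are the defaults above):
+#
+#   python video_generate.py --data_dir ARC-AGI-2/data --output_dir videos \
+#       --repeat_num 3 --fps 15 --resolution 720 1280
+#   python video_evaluate.py --video_dir videos --data_dir ARC-AGI-2/data \
+#       --output results.json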