#!/usr/bin/env python3
"""
Generate a JSON file listing exactly the SA-1B image paths
that have LayoutSAM annotations.
"""

import json

from datasets import load_dataset


def main():
    """Write the sorted, de-duplicated LayoutSAM image paths to a JSON file.

    Loads the "train" split of ``HuiZhang0812/LayoutSAM``, collects the
    distinct values of its ``image_path`` column, and dumps them as a sorted
    JSON array to ``images_to_download.json`` in the current working
    directory.
    """
    # 1. Load the LayoutSAM "train" split.
    #    This gives us a column "image_path" with values like
    #    "sa_000000/sa_10000.jpg".
    ds = load_dataset("HuiZhang0812/LayoutSAM", split="train")

    # 2. Extract, dedupe, and sort the image_path entries.
    #    Only the one column is read: iterating the dataset row by row would
    #    build a full dict of every column for every row just to pick out a
    #    single string.
    image_paths = sorted(ds.unique("image_path"))

    # 3. Write out to JSON for the SA-1B-Downloader to consume.
    out_file = "images_to_download.json"
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(image_paths, f, indent=2)

    print(f"✅ Wrote {len(image_paths)} paths to {out_file}")


if __name__ == "__main__":
    main()