#!/usr/bin/env python3 | |
""" | |
Generate a JSON file listing exactly the SA-1B image paths | |
that have LayoutSAM annotations. | |
""" | |
import json | |
from datasets import load_dataset | |
def main():
    """Write the sorted, de-duplicated LayoutSAM image paths to a JSON file.

    Loads the "train" split of the ``HuiZhang0812/LayoutSAM`` dataset,
    collects the distinct values of its ``image_path`` column, and dumps them
    as a sorted JSON array to ``images_to_download.json`` in the current
    working directory. A one-line summary is printed once the file is written.
    """
    # 1. Load the LayoutSAM "train" split.
    #    The "image_path" column is expected to hold values like
    #    "sa_000000/sa_10000.jpg".
    ds = load_dataset("HuiZhang0812/LayoutSAM", split="train")

    # 2. Dedupe and sort the image_path entries.
    #    Only the one column is read: iterating the dataset row by row would
    #    materialize every other column of every record just to pick out a
    #    single field.
    image_paths = sorted(ds.unique("image_path"))

    # 3. Write out to JSON for the SA-1B-Downloader to consume.
    #    The encoding is pinned so the output does not depend on the
    #    platform's default locale.
    out_file = "images_to_download.json"
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(image_paths, f, indent=2)

    print(f"✅ Wrote {len(image_paths)} paths to {out_file}")
# Only run the export when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == "__main__":
    main()