sparse / ms-swift /uni /generate_image_list.py
Enxin's picture
Upload folder using huggingface_hub
96fe658 verified
#!/usr/bin/env python3
"""
Generate a JSON file listing exactly the SA-1B image paths
that have LayoutSAM annotations.
"""
import json
from datasets import load_dataset
def main():
    """Write the sorted, de-duplicated LayoutSAM image paths to a JSON file.

    Loads the "train" split of the ``HuiZhang0812/LayoutSAM`` dataset,
    collects the distinct values of its ``image_path`` column, and dumps them
    as a sorted JSON array to ``images_to_download.json`` in the current
    working directory. A one-line summary is printed once the file is written.
    """
    # 1. Load the LayoutSAM "train" split.
    #    Its "image_path" column holds values like "sa_000000/sa_10000.jpg".
    ds = load_dataset("HuiZhang0812/LayoutSAM", split="train")

    # 2. Extract, dedupe, and sort the image_path entries.
    #    Read the column directly instead of iterating row dicts: row-wise
    #    iteration materializes every field of every row even though only
    #    one field is needed here.
    image_paths = sorted(set(ds["image_path"]))

    # 3. Write out to JSON for the SA-1B-Downloader to consume.
    out_file = "images_to_download.json"
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(image_paths, f, indent=2)

    print(f"✅ Wrote {len(image_paths)} paths to {out_file}")
# Run the generator only when this file is executed as a script, so that
# importing the module does not trigger the dataset load or the file write.
if __name__ == "__main__":
    main()