|
import json |
|
import os |
|
import argparse |
|
from tqdm import tqdm |
|
import yaml |
|
|
|
|
|
def check_missing_images(json_path, images_folder):
    """Return the dataset records whose referenced image file is not on disk.

    The JSON file at ``json_path`` is loaded and iterated with a tqdm
    progress bar.  For every record, the value stored under the ``"image"``
    key is joined onto ``images_folder`` and tested with ``os.path.exists``.
    Records that have no ``"image"`` key, or an empty value for it, are
    skipped.

    Args:
        json_path: Path to a JSON file holding a sequence of records
            (assumed to be dicts -- they are queried with ``.get``).
        images_folder: Directory that the ``"image"`` values are relative to.

    Returns:
        A list of the records that reference at least one missing image, in
        the order they appear in the file.  Every missing path is also
        printed as it is discovered.
    """
    # A context manager closes the handle deterministically instead of
    # leaving it to the garbage collector; JSON text is UTF-8 by specification.
    with open(json_path, "r", encoding="utf-8") as f:
        records = json.load(f)

    missing_data = []
    for record in tqdm(records):
        images = record.get("image")
        if not images:
            # No image attached to this record (key absent, "" or None).
            continue

        # Accept either a single relative path or a list of them, so a
        # multi-image record does not crash os.path.join with a TypeError.
        relative_paths = [images] if isinstance(images, str) else images

        record_has_missing = False
        for relative_path in relative_paths:
            path = os.path.join(images_folder, relative_path)
            if not os.path.exists(path):
                print(f"Missing image: {path}")
                record_has_missing = True

        # Report the record once, however many of its images are missing.
        if record_has_missing:
            missing_data.append(record)

    return missing_data
|
|
|
|
|
def read_yaml_to_llava_data(yaml_path, images_folder):
    """Check every dataset listed in a YAML file for missing images.

    The YAML document is parsed with ``yaml.safe_load``; each entry of its
    ``"datasets"`` sequence must provide a ``"json_path"``.  That JSON file
    is passed to ``check_missing_images`` together with ``images_folder``,
    and any records it returns are printed under a per-file header.

    Args:
        yaml_path: Path to the YAML file listing the datasets.
        images_folder: Directory that image paths are resolved against.
    """
    print(f"Reading YAML file: {yaml_path}")
    with open(yaml_path, "r") as yaml_file:
        config = yaml.safe_load(yaml_file)

    for entry in config["datasets"]:
        dataset_json = entry["json_path"]
        missing_records = check_missing_images(dataset_json, images_folder)
        if not missing_records:
            # Nothing missing for this dataset; move on to the next one.
            continue

        print(f"Missing images in {dataset_json}:")
        for record in missing_records:
            print(record)
|
|
|
|
|
def direct_check_llava_data(json_path, images_folder):
    """Check a single dataset JSON file for missing images and print them.

    Delegates to ``check_missing_images``; when it returns any records, a
    header naming ``json_path`` is printed followed by one record per line.

    Args:
        json_path: Path to the dataset JSON file.
        images_folder: Directory that image paths are resolved against.
    """
    missing_records = check_missing_images(json_path, images_folder)
    if not missing_records:
        return

    print(f"Missing images in {json_path}:")
    for record in missing_records:
        print(record)
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: check one JSON dataset directly, or every
    # dataset listed in a YAML file.  When both paths are supplied,
    # --json_path takes precedence and --yaml_path is ignored.
    parser = argparse.ArgumentParser(description="Check for missing images in dataset.")
    parser.add_argument("--yaml_path", type=str, default="", help="Path to the YAML file containing the dataset.")
    parser.add_argument("--json_path", type=str, default="", help="Path to the JSON file containing the dataset.")
    parser.add_argument("--images_folder", type=str, default="/mnt/bn/vl-research/data/llava_data", help="Path to the folder containing the images.")

    args = parser.parse_args()

    if args.json_path:
        direct_check_llava_data(args.json_path, args.images_folder)
    elif args.yaml_path:
        read_yaml_to_llava_data(args.yaml_path, args.images_folder)
    else:
        # Without an input the script used to exit successfully having
        # checked nothing; report a usage error instead.
        parser.error("one of --json_path or --yaml_path is required")
|
|