# llava-uhd-new / scripts / archived / quick_check.py
# ZzzHelloWorld: "Add files using upload-large-folder tool" (ca7c7f9, verified)
import json
import os
import argparse
from tqdm import tqdm
import yaml
def check_missing_images(json_path, images_folder):
    """Return the dataset entries whose image file does not exist on disk.

    Loads the JSON file at ``json_path`` and iterates over its entries with
    a progress bar. For every entry that has a non-empty ``"image"`` value,
    the path ``os.path.join(images_folder, image)`` is tested for existence;
    each path that is absent is printed as soon as it is found.

    Args:
        json_path: Path to the JSON file to load.
        images_folder: Directory onto which each ``"image"`` value is joined.

    Returns:
        A list of the entries whose image path does not exist, in the order
        they appear in the file. Entries with no ``"image"`` key or with an
        empty-string image are never included.
    """
    # A context manager closes the handle deterministically; the previous
    # bare open() left it to the garbage collector. JSON text is UTF-8 by
    # specification, so do not depend on the platform's locale encoding.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    missing_data = []
    for d in tqdm(data):
        # Nothing to verify for entries without an image reference.
        if "image" not in d:
            continue
        image = d["image"]
        if image == "":
            continue

        path = os.path.join(images_folder, image)
        if not os.path.exists(path):
            print(f"Missing image: {path}")
            missing_data.append(d)
    return missing_data
def read_yaml_to_llava_data(yaml_path, images_folder):
    """Check every JSON dataset listed in a YAML file for missing images.

    The YAML document is parsed with ``yaml.safe_load``. Its top-level
    ``"datasets"`` entry is iterated, and the ``"json_path"`` of each item
    is checked against ``images_folder``; datasets with missing images are
    reported on stdout.

    Args:
        yaml_path: Path to the YAML file to read.
        images_folder: Directory that image paths are resolved against.

    Raises:
        KeyError: If the document has no ``"datasets"`` key, or an item has
            no ``"json_path"`` key.
    """
    print(f"Reading YAML file: {yaml_path}")
    with open(yaml_path, "r") as f:
        data = yaml.safe_load(f)

    for item in data["datasets"]:
        # Reuse the single-file check-and-report routine instead of keeping
        # a second copy of the same printing loop here.
        direct_check_llava_data(item["json_path"], images_folder)
def direct_check_llava_data(json_path, images_folder):
    """Check one JSON dataset and print the entries whose image is missing.

    Runs ``check_missing_images`` on ``json_path``. If it returns any
    entries, a header line naming the JSON file is printed, followed by one
    line per entry. Nothing further is printed when no image is missing.

    Args:
        json_path: Path to the JSON file to check.
        images_folder: Directory that image paths are resolved against.
    """
    absent_entries = check_missing_images(json_path, images_folder)
    if not absent_entries:
        return

    print(f"Missing images in {json_path}:")
    for entry in absent_entries:
        print(entry)
def main(argv=None):
    """Parse command-line options and run the requested missing-image check.

    ``--json_path`` takes precedence: when it is non-empty that single file
    is checked. Otherwise a non-empty ``--yaml_path`` is read and every
    dataset it lists is checked. When neither is supplied the parser reports
    a usage error instead of exiting silently, so an accidental no-op run
    cannot be mistaken for a clean result.

    Args:
        argv: Argument list to parse; ``None`` makes argparse fall back to
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Check for missing images in dataset.")
    parser.add_argument("--yaml_path", type=str, default="", help="Path to the YAML file containing the dataset.")
    parser.add_argument("--json_path", type=str, default="", help="Path to the JSON file containing the dataset.")
    parser.add_argument("--images_folder", type=str, default="/mnt/bn/vl-research/data/llava_data", help="Path to the folder containing the images.")
    args = parser.parse_args(argv)

    if args.json_path != "":
        direct_check_llava_data(args.json_path, args.images_folder)
    elif args.yaml_path != "":
        read_yaml_to_llava_data(args.yaml_path, args.images_folder)
    else:
        # Prints usage to stderr and exits with status 2.
        parser.error("one of --json_path or --yaml_path is required")


if __name__ == "__main__":
    main()