"""Re-point dataset split manifests at the files in the scale-converted directory.

For each of the ``train``, ``valid`` and ``test`` splits this script reads a
JSON-lines manifest from ``JOINT_JSON_DIRECTORY``, keeps only the examples
whose image file name is present in ``SCALE_CONVERTED_DIRECTORY``, rewrites
their ``image_path`` to point into that directory, and writes the surviving
examples as a new JSON-lines manifest inside ``SCALE_CONVERTED_DIRECTORY``.
"""
import json
import os

from tqdm import tqdm
import torch
from torchvision.io import ImageReadMode, read_image

JOINT_JSON_DIRECTORY = f"/home/{os.environ['USER']}/data/wit/all_jsons"
SCALE_CONVERTED_DIRECTORY = f"/home/{os.environ['USER']}/data/wit_scale_converted"


def _load_examples(manifest_path):
    """Return the JSON objects stored one per line in *manifest_path*.

    Blank lines (for instance a trailing empty line) are skipped instead of
    being handed to ``json.loads``, which would raise on them.
    """
    # Explicit UTF-8: the default text encoding is locale dependent, while the
    # manifests written by this script contain raw non-ASCII characters.
    with open(manifest_path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _filter_supported(examples, valid_files):
    """Serialize the examples whose image file name is in *valid_files*.

    Each kept example has its ``"image_path"`` replaced (in place) by the path
    of the same file name inside ``SCALE_CONVERTED_DIRECTORY``. Returns a list
    of JSON strings, one per kept example, in input order.

    Raises:
        KeyError: if an example has no ``"image_path"`` key.
    """
    supported = []
    for example in tqdm(examples):
        filename = os.path.basename(example["image_path"])
        if filename in valid_files:
            example["image_path"] = os.path.join(SCALE_CONVERTED_DIRECTORY, filename)
            # ensure_ascii=False keeps non-ASCII text readable in the output;
            # this is why the output file must be opened as UTF-8 below.
            supported.append(json.dumps(example, ensure_ascii=False))
    return supported


# The directory listing does not depend on the split, so build the lookup set
# once instead of re-listing the directory on every iteration.
valid_files = set(os.listdir(SCALE_CONVERTED_DIRECTORY))

for split in ["train", "valid", "test"]:
    print("Reading json")
    examples = _load_examples(
        f"{JOINT_JSON_DIRECTORY}/{split}_dataset_all_98_1_1_split.json"
    )

    supported_examples = _filter_supported(examples, valid_files)
    print(f"Total {split} examples: {len(supported_examples)}")

    output_path = (
        f"{SCALE_CONVERTED_DIRECTORY}/{split}_dataset_scale_converted_98_1_1_split.json"
    )
    with open(output_path, "w", encoding="utf-8") as f:
        # One JSON object per line, no trailing newline (same as before).
        f.write("\n".join(supported_examples))

print("DONE!")