# clip-spanish / discard_incorrect_files.py
# Commit 2daf3c7 (edugp): Add all necessary files to replicate training run
import getpass
import json
import os

import torch
from torchvision.io import ImageReadMode, read_image
from tqdm import tqdm
# Login name used to locate the per-user data tree under /home.
# $USER keeps precedence so existing setups resolve to the same paths; when it
# is unset or empty (cron jobs, some containers), fall back to
# getpass.getuser() instead of crashing with a KeyError.
_USER = os.environ.get("USER") or getpass.getuser()

# Directory holding the joint "<split>_dataset_all_98_1_1_split.json" files.
JOINT_JSON_DIRECTORY = f"/home/{_USER}/data/wit/all_jsons"
# Directory whose file listing decides which examples are kept; the filtered
# JSON files are written here as well.
SCALE_CONVERTED_DIRECTORY = f"/home/{_USER}/data/wit_scale_converted"
def load_examples(path):
    """Read a JSON-lines file and return its records as a list.

    Args:
        path: Path of a file containing one JSON document per line.

    Returns:
        The decoded records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(path, encoding="utf-8") as f:
        # Blank lines (e.g. a trailing empty line) are skipped rather than
        # handed to json.loads, which would reject them.
        return [json.loads(line) for line in f if line.strip()]


def select_converted_examples(examples, valid_files, image_directory):
    """Keep the examples whose image file name appears in ``valid_files``.

    Each kept example has its ``"image_path"`` rewritten (in place) to point
    inside ``image_directory`` and is serialized to a JSON string.

    Args:
        examples: Iterable of dicts, each with an ``"image_path"`` key.
        valid_files: Collection of accepted file names (a set for fast lookup).
        image_directory: Directory that the rewritten paths should point into.

    Returns:
        A list of JSON strings, one per kept example, in input order.

    Raises:
        KeyError: If an example has no ``"image_path"`` entry.
    """
    supported_examples = []
    for example in examples:
        # Only the file name matters; the original directory is discarded.
        filename = os.path.basename(example["image_path"])
        if filename in valid_files:
            example["image_path"] = os.path.join(image_directory, filename)
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            supported_examples.append(json.dumps(example, ensure_ascii=False))
    return supported_examples


def write_examples(path, serialized_examples):
    """Write the JSON strings to ``path``, one per line, without a trailing newline.

    Args:
        path: Destination file; overwritten if it exists.
        serialized_examples: List of already-serialized JSON strings.
    """
    # The strings may contain raw non-ASCII characters (ensure_ascii=False),
    # so the encoding must be pinned rather than left to the locale default.
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(serialized_examples))


def main():
    """Filter every dataset split down to examples with a scale-converted image."""
    # List the image directory once: it is the same for every split, and the
    # listing can be expensive for a large directory.
    valid_files = set(os.listdir(SCALE_CONVERTED_DIRECTORY))
    for split in ["train", "valid", "test"]:
        print("Reading json")
        examples = load_examples(
            f"{JOINT_JSON_DIRECTORY}/{split}_dataset_all_98_1_1_split.json"
        )
        # tqdm only wraps the iterable to display progress.
        supported_examples = select_converted_examples(
            tqdm(examples), valid_files, SCALE_CONVERTED_DIRECTORY
        )
        print(f"Total {split} examples: {len(supported_examples)}")
        write_examples(
            f"{SCALE_CONVERTED_DIRECTORY}/{split}_dataset_scale_converted_98_1_1_split.json",
            supported_examples,
        )
    print("DONE!")


if __name__ == "__main__":
    main()