File size: 1,112 Bytes
8e2b754
 
2daf3c7
8e2b754
 
 
 
2daf3c7
 
8e2b754
 
2daf3c7
 
8e2b754
2daf3c7
 
8e2b754
2daf3c7
 
 
 
8e2b754
 
 
2daf3c7
8e2b754
2daf3c7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import json
import os
from tqdm import tqdm

import torch
from torchvision.io import ImageReadMode, read_image

# Directory holding the per-split JSON-lines inputs (one JSON object per line).
JOINT_JSON_DIRECTORY = f"/home/{os.environ['USER']}/data/wit/all_jsons"
# Directory whose file listing decides which examples are kept; the filtered
# JSON-lines outputs are written into this same directory.
SCALE_CONVERTED_DIRECTORY = f"/home/{os.environ['USER']}/data/wit_scale_converted"

# The listing does not depend on the split, so build the lookup set once
# instead of re-scanning the directory on every iteration.
valid_files = set(os.listdir(SCALE_CONVERTED_DIRECTORY))

for split in ["train", "valid", "test"]:
    print("Reading json")
    # Explicit UTF-8: the records are re-serialized with ensure_ascii=False
    # below, so both ends must agree on the encoding regardless of the locale.
    with open(
        f"{JOINT_JSON_DIRECTORY}/{split}_dataset_all_98_1_1_split.json",
        encoding="utf-8",
    ) as f:
        # Iterate the handle directly (no intermediate list of lines) and skip
        # blank lines so a trailing newline does not break json.loads.
        examples = [json.loads(line) for line in f if line.strip()]

    supported_examples = []
    for example in tqdm(examples):
        # Only the file name matters; the original directory part is dropped.
        filename = os.path.basename(example["image_path"])
        if filename in valid_files:
            # Re-point the example at the file inside SCALE_CONVERTED_DIRECTORY.
            example["image_path"] = os.path.join(SCALE_CONVERTED_DIRECTORY, filename)
            supported_examples.append(json.dumps(example, ensure_ascii=False))

    print(f"Total {split} examples: {len(supported_examples)}")
    with open(
        f"{SCALE_CONVERTED_DIRECTORY}/{split}_dataset_scale_converted_98_1_1_split.json",
        "w",
        encoding="utf-8",
    ) as f:
        # One JSON object per line, no trailing newline (same layout as before).
        f.write("\n".join(supported_examples))

print("DONE!")