from PIL import Image
from PIL import UnidentifiedImageError
import os
import json
from pathlib import Path
from tqdm import tqdm
import shutil


def save_to_jsons(data_list, target_dir, starting_idx=0):
    # dump each entry of data_list to target_dir as <index>.json, numbered consecutively from starting_idx
    pbar = tqdm(
        enumerate(data_list), desc=f"saving {len(data_list)} jsons to {str(target_dir)}"
    )
    for k, data in pbar:
        filename = Path(target_dir) / f"{k + starting_idx}.json"
        with open(filename, "w") as f:
            json.dump(data, f)


def save_images(img_list, target_dir, mode="mv"):
    # move ("mv") or copy ("cp") every image in img_list into target_dir
    for img_path in tqdm(
        img_list,
        desc=f"saving {len(img_list)} images (mode={mode}) to {str(target_dir)}",
    ):
        if mode == "mv":
            shutil.move(str(img_path), str(target_dir))
        elif mode == "cp":
            shutil.copy(str(img_path), str(target_dir))
        else:
            raise ValueError(f"unknown mode '{mode}', expected 'mv' or 'cp'")


def convert_dataset(
    data_dir,
    dir_size=10000,
    hash_fn=None,
    mode="mv",
    ds_iterator=None,
):
    """
    Builds a dataset directory in our standard format. ds_iterator should return data of the form
    image_path, {"captions": [...], "metadata": {...}, }, where image_path should be a Path object, captions should map to a list of strings
    and metadata can contain any custom data about the image. If a hash_fn is specified (such as phash), the image hash gets saved in metadata.
    """

    data_dir = Path(data_dir)

    # one folder for the images and one for the corresponding data, which is stored in one json file per image
    os.makedirs(data_dir / "images", exist_ok=True)
    os.makedirs(data_dir / "image_data", exist_ok=True)

    img_data_list = []
    img_path_list = []
    save_img_dir = data_dir / "images" / "0"
    save_data_dir = data_dir / "image_data" / "0"
    num_img_dirs = 0

    # save the new locations of all img files in case some datafiles point to the same image
    new_img_locations = {}

    pbar = tqdm(
        enumerate(ds_iterator),
        total=len(ds_iterator),
        desc="converting dataset to standard format...",
    )

    for k, (img_path, data) in pbar:
        img_cpt_data = {}
        # get img data
        img_cpt_data.update(data)

        if str(img_path) in new_img_locations:
            # if the filename is already in the dictionary, the image already has a new location
            new_img_path = new_img_locations[str(img_path)]["new_img_path"]
            img_cpt_data["image_path"] = new_img_path
            # reuse the cached hash; it is missing if the image turned out to be unreadable
            if hash_fn is not None and "hash" in new_img_locations[str(img_path)]:
                img_cpt_data["metadata"]["image_hash"] = new_img_locations[
                    str(img_path)
                ]["hash"]
        else:
            # if the file exists in the old location, it will get moved to a new directory
            new_img_path = f"images/{save_img_dir.name}/{img_path.name}"
            img_cpt_data["image_path"] = new_img_path
            new_img_locations[str(img_path)] = {"new_img_path": new_img_path}
            # the original path is recorded so the image can later be moved/copied to the new directory
            img_path_list.append(img_path)

            # if given, apply the hash fn
            if hash_fn is not None:
                try:
                    img = Image.open(img_path).convert("RGB")
                    hash_str = str(hash_fn(img))
                    img_cpt_data["metadata"]["image_hash"] = hash_str
                    # cache the hash so it does not have to be recomputed for duplicates
                    new_img_locations[str(img_path)]["hash"] = hash_str
                except (UnidentifiedImageError, FileNotFoundError):
                    print(f"Warning: corrupted or non-existent image: {img_path}")

        img_data_list.append(img_cpt_data)

        # save images in specified images folder (maximum of dir_size images per folder)
        if (len(img_path_list) % dir_size == 0 and len(img_path_list) > 0) or (
            k == len(ds_iterator) - 1
        ):
            os.makedirs(save_img_dir, exist_ok=True)
            save_images(img_path_list, save_img_dir, mode=mode)
            img_path_list = []
            num_img_dirs += 1
            save_img_dir = data_dir / "images" / f"{num_img_dirs}"

        # save json data in the specified image_data folder with consecutive numbering of the json files
        if ((k + 1) % dir_size == 0) or (k == len(ds_iterator) - 1):
            os.makedirs(save_data_dir, exist_ok=True)
            # the first index of the current chunk is its end index minus the number of buffered entries
            save_to_jsons(
                img_data_list, save_data_dir, starting_idx=k + 1 - len(img_data_list)
            )
            # empty the data list and update the save directory for the next saving step
            img_data_list = []
            save_data_dir = data_dir / "image_data" / f"{(k + 1) // dir_size}"
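

# Usage sketch: a minimal, self-contained example of the ds_iterator contract
# described in the convert_dataset docstring. It is an illustrative sketch,
# not part of the conversion pipeline: the directory names ("raw_example",
# "converted_example") and the captions are assumptions made for the demo.
# A plain list is used because it supports both iteration and len().
if __name__ == "__main__":
    raw_dir = Path("raw_example")
    os.makedirs(raw_dir, exist_ok=True)
    # create two dummy images to stand in for a real raw dataset
    for i, color in enumerate(["red", "blue"]):
        Image.new("RGB", (32, 32), color).save(raw_dir / f"img_{i}.jpg")

    ds_iterator = [
        (raw_dir / f"img_{i}.jpg", {"captions": [f"a {color} square"], "metadata": {}})
        for i, color in enumerate(["red", "blue"])
    ]
    convert_dataset(
        "converted_example",  # images/ and image_data/ are created inside this dir
        dir_size=10000,
        hash_fn=None,  # e.g. imagehash.phash would also fit (extra dependency)
        mode="cp",  # "cp" keeps the dummy originals; "mv" would move them
        ds_iterator=ds_iterator,
    )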