Spaces:
Runtime error
Runtime error
File size: 4,567 Bytes
bb5cd12 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
from PIL import Image
from PIL import UnidentifiedImageError
import os
import json
from pathlib import Path
from tqdm import tqdm
import shutil
def save_to_jsons(data_list, target_dir, starting_idx=0):
    """
    Writes every entry of data_list to its own json file inside target_dir.

    The files are named "<index>.json", where the index counts up from starting_idx in the
    order of data_list. target_dir is not created here, so it has to exist already.
    """
    out_dir = Path(target_dir)
    pbar = tqdm(
        enumerate(data_list, start=starting_idx),
        # enumerate() hides the length of data_list from tqdm, so it is passed explicitly;
        # without it the progress bar cannot show a percentage or the remaining time
        total=len(data_list),
        desc=f"saving {len(data_list)} jsons to {str(target_dir)}",
    )
    for idx, data in pbar:
        with open(out_dir / f"{idx}.json", "w", encoding="utf-8") as f:
            json.dump(data, f)
def save_images(img_list, target_dir, mode="mv"):
    """
    Transfers all files listed in img_list into target_dir.

    mode="mv" moves the files (shutil.move), mode="cp" copies them (shutil.copy).
    For any other mode the list is still iterated, but no file is touched.
    """
    # pick the transfer function once instead of re-checking the mode for every file
    if mode == "mv":
        transfer = shutil.move
    elif mode == "cp":
        transfer = shutil.copy
    else:
        transfer = None

    progress = tqdm(
        img_list,
        desc=f"saving {len(img_list)} images (mode={mode}) to {str(target_dir)}",
    )
    for src_path in progress:
        if transfer is not None:
            transfer(src_path, target_dir)
def _compute_image_hash(img_path, hash_fn):
    """Returns str(hash_fn(image)) for the image at img_path, or None if it cannot be opened."""
    try:
        # the context manager closes the file handle again, so the file is not left open
        # when it gets moved/copied later on
        with Image.open(img_path) as raw_img:
            return str(hash_fn(raw_img.convert("RGB")))
    except (UnidentifiedImageError, FileNotFoundError):
        print("Warning: corrupted or non-existent Image")
        return None


def _flush_images(img_paths, data_dir, dir_idx, mode):
    """Saves img_paths into <data_dir>/images/<dir_idx>, creating that folder if needed."""
    save_img_dir = data_dir / "images" / str(dir_idx)
    os.makedirs(save_img_dir, exist_ok=True)
    save_images(img_paths, save_img_dir, mode=mode)


def _flush_jsons(img_data, data_dir, dir_idx, starting_idx):
    """Saves img_data as json files into <data_dir>/image_data/<dir_idx>, creating it if needed."""
    save_data_dir = data_dir / "image_data" / str(dir_idx)
    os.makedirs(save_data_dir, exist_ok=True)
    save_to_jsons(img_data, save_data_dir, starting_idx=starting_idx)


def convert_dataset(
    data_dir,
    dir_size=10000,
    hash_fn=None,
    mode="mv",
    ds_iterator=None,
):
    """
    Builds a dataset directory in our standard format.

    ds_iterator should yield data of the form
    image_path, {"captions": [...], "metadata": {...}, }, where image_path should be a path
    (Path object or string), captions should map to a list of strings and metadata can contain
    any custom data about the image. Any iterable works, including generators without a length.

    The result is laid out as
        <data_dir>/images/<n>/<image file name>
        <data_dir>/image_data/<n>/<idx>.json
    with at most dir_size entries per numbered folder. Every json file holds the yielded data
    plus an "image_path" entry pointing to the new image location relative to data_dir. The json
    files are numbered consecutively over the whole dataset.

    If several entries point to the same image, the image is only transferred once and all of
    their json files reference the same new location.

    If a hash_fn is specified (such as phash), str(hash_fn(image)) gets saved in
    metadata["image_hash"]. Images that cannot be opened only trigger a warning and get no hash.
    The yielded metadata dict itself is not modified; the hash is written to a copy.

    mode is passed on to save_images ("mv" moves the images, "cp" copies them).

    Raises TypeError if ds_iterator is None and ValueError if dir_size is smaller than 1.

    NOTE: images keep their original file name, so differently located source images that share
    a file name and fall into the same numbered folder target the same destination.
    """
    if ds_iterator is None:
        raise TypeError(
            "ds_iterator must be an iterable of (image_path, data) pairs, got None"
        )
    if dir_size < 1:
        raise ValueError(f"dir_size must be at least 1, got {dir_size}")

    data_dir = Path(data_dir)
    # folders for images and corresponding data which is stored in a json file for each image
    os.makedirs(data_dir / "images", exist_ok=True)
    os.makedirs(data_dir / "image_data", exist_ok=True)

    img_data_list = []  # json payloads waiting to be written
    img_path_list = []  # original image locations waiting to be moved/copied
    img_dir_idx = 0
    json_dir_idx = 0
    num_jsons_saved = 0  # index of the next json file, counted over the whole dataset
    # save the new locations of all img files in case some datafiles point to the same image
    new_img_locations = {}

    # tqdm picks up the total by itself if ds_iterator has a length
    pbar = tqdm(ds_iterator, desc="converting dataset to standard format...")
    for img_path, data in pbar:
        img_cpt_data = dict(data)

        location = new_img_locations.get(str(img_path))
        if location is None:
            # first time this image shows up: it goes into the image folder that is
            # currently being filled
            location = {
                "new_img_path": f"images/{img_dir_idx}/{Path(img_path).name}"
            }
            new_img_locations[str(img_path)] = location
            # the original location is remembered and transferred with the next flush
            img_path_list.append(img_path)
            if hash_fn is not None:
                hash_str = _compute_image_hash(img_path, hash_fn)
                if hash_str is not None:
                    # save hash so it does not have to be recomputed
                    location["hash"] = hash_str

        img_cpt_data["image_path"] = location["new_img_path"]
        # "hash" is missing if no hash_fn was given or if the image could not be opened
        image_hash = location.get("hash")
        if image_hash is not None:
            # work on a copy so the dict yielded by the caller stays untouched
            metadata = dict(img_cpt_data.get("metadata") or {})
            metadata["image_hash"] = image_hash
            img_cpt_data["metadata"] = metadata
        img_data_list.append(img_cpt_data)

        # save images in specified images folder (maximum of dir_size images per folder)
        if len(img_path_list) >= dir_size:
            _flush_images(img_path_list, data_dir, img_dir_idx, mode)
            img_path_list = []
            img_dir_idx += 1

        # save json data in specified image_data folder with consecutive labeling of the files
        if len(img_data_list) >= dir_size:
            _flush_jsons(img_data_list, data_dir, json_dir_idx, num_jsons_saved)
            num_jsons_saved += len(img_data_list)
            img_data_list = []
            json_dir_idx += 1

    # write whatever is left over from the last, only partially filled batch; doing this
    # after the loop means the length of ds_iterator never has to be known
    if img_path_list:
        _flush_images(img_path_list, data_dir, img_dir_idx, mode)
    if img_data_list:
        _flush_jsons(img_data_list, data_dir, json_dir_idx, num_jsons_saved)
|