from PIL import Image
from PIL import UnidentifiedImageError
import os
import json
from pathlib import Path
from tqdm import tqdm
import shutil


def save_to_jsons(data_list, target_dir, starting_idx=0):
    pbar = tqdm(
        enumerate(data_list),
        desc=f"saving {len(data_list)} jsons to {str(target_dir)}",
    )
    for k, data in pbar:
        filename = Path(target_dir) / f"{k + starting_idx}.json"
        with open(filename, "w") as f:
            json.dump(data, f)


def save_images(img_list, target_dir, mode="mv"):
    for img_path in tqdm(
        img_list,
        desc=f"saving {len(img_list)} images (mode={mode}) to {str(target_dir)}",
    ):
        if mode == "mv":
            shutil.move(img_path, target_dir)
        elif mode == "cp":
            shutil.copy(img_path, target_dir)


def convert_dataset(
    data_dir,
    dir_size=10000,
    hash_fn=None,
    mode="mv",
    ds_iterator=None,
):
    """
    Builds a dataset directory in our standard format.
    ds_iterator should return data of the form
        image_path, {"captions": [...], "metadata": {...}},
    where image_path should be a Path object, captions should map to a list of
    strings and metadata can contain any custom data about the image.
    If a hash_fn is specified (such as phash), the image hash gets saved in metadata.
    """
    data_dir = Path(data_dir)

    # folders for images and corresponding data, which is stored in a json file for each image
    os.makedirs(data_dir / "images", exist_ok=True)
    os.makedirs(data_dir / "image_data", exist_ok=True)

    img_data_list = []
    img_path_list = []
    save_img_dir = data_dir / "images" / "0"
    save_data_dir = data_dir / "image_data" / "0"
    num_img_dirs = 0

    # save the new locations of all img files in case some datafiles point to the same image
    new_img_locations = {}

    pbar = tqdm(
        enumerate(ds_iterator),
        desc="converting dataset to standard format...",
    )
    for k, (img_path, data) in pbar:
        img_cpt_data = {}
        # get img data
        img_cpt_data.update(data)

        if str(img_path) in new_img_locations:
            # if the filename is in the dictionary, it already has a new location
            new_img_path = new_img_locations[str(img_path)]["new_img_path"]
            img_cpt_data["image_path"] = new_img_path
            if hash_fn is not None:
                # the hash may be missing if the image failed to load on first encounter
                cached_hash = new_img_locations[str(img_path)].get("hash")
                if cached_hash is not None:
                    img_cpt_data["metadata"]["image_hash"] = cached_hash
        else:
            # if the file exists in the old location, it will get moved to a new directory
            new_img_path = f"images/{save_img_dir.name}/{img_path.name}"
            img_cpt_data["image_path"] = new_img_path
            new_img_locations[str(img_path)] = {"new_img_path": new_img_path}

            # the original location is saved and the image is later moved/copied to the new directory
            img_path_list.append(img_path)

            # if given, apply hash fn
            if hash_fn is not None:
                try:
                    img = Image.open(img_path).convert("RGB")
                    hash_str = str(hash_fn(img))
                    img_cpt_data["metadata"]["image_hash"] = hash_str
                    # save hash so it does not have to be recomputed
                    new_img_locations[str(img_path)]["hash"] = hash_str
                except (UnidentifiedImageError, FileNotFoundError):
                    print(f"Warning: corrupted or non-existent image: {img_path}")

        img_data_list.append(img_cpt_data)

        # save images in the specified images folder (maximum of dir_size images per folder)
        if (len(img_path_list) % dir_size == 0 and len(img_path_list) > 0) or (
            k == len(ds_iterator) - 1
        ):
            os.makedirs(save_img_dir, exist_ok=True)
            save_images(img_path_list, save_img_dir, mode=mode)
            img_path_list = []
            num_img_dirs += 1
            save_img_dir = data_dir / "images" / f"{num_img_dirs}"

        # save json data in the specified image_data folder with consecutive labeling of the json files
        if ((k + 1) % dir_size == 0) or (k == len(ds_iterator) - 1):
            os.makedirs(save_data_dir, exist_ok=True)
            save_to_jsons(
                img_data_list, save_data_dir, starting_idx=max(k + 1 - dir_size, 0)
            )

            # empty path and data lists and update save directories for the next saving step
            img_data_list = []
            save_data_dir = data_dir / "image_data" / f"{(k + 1) // dir_size}"
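

# Example usage: a minimal sketch, not part of the module's API. It assumes
# images already exist on disk under the hypothetical "raw/" directory and
# uses imagehash.phash as one possible hash_fn. Note that convert_dataset
# calls len(ds_iterator), so the iterator must be a sized sequence (e.g. a
# list), not a bare generator.
if __name__ == "__main__":
    import imagehash  # one possible choice of perceptual hash library

    def example_iterator(samples):
        # yields (image_path, data) pairs in the format convert_dataset expects
        for img_path, caption in samples:
            yield Path(img_path), {"captions": [caption], "metadata": {}}

    # hypothetical (path, caption) pairs for illustration
    samples = [
        ("raw/cat.jpg", "a cat sitting on a windowsill"),
        ("raw/dog.jpg", "a dog playing fetch"),
    ]

    convert_dataset(
        "converted_dataset",
        dir_size=10000,
        hash_fn=imagehash.phash,
        mode="cp",  # copy instead of move, keeping the originals in place
        ds_iterator=list(example_iterator(samples)),
    )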