import cv2
import lmdb
import sys
from multiprocessing import Pool
from os import path as osp
from tqdm import tqdm


def make_lmdb_from_imgs(data_path,
                        lmdb_path,
                        img_path_list,
                        keys,
                        batch=5000,
                        compress_level=1,
                        multiprocessing_read=False,
                        n_thread=40,
                        map_size=None):
    """Make lmdb from images.

    Contents of lmdb. The file structure is:

    ::

        example.lmdb
        ├── data.mdb
        ├── lock.mdb
        ├── meta_info.txt

    The data.mdb and lock.mdb are standard lmdb files; refer to
    https://lmdb.readthedocs.io/en/release/ for more details.

    The meta_info.txt is a specified txt file to record the meta information
    of our datasets. It will be automatically created when preparing
    datasets by our provided dataset tools.
    Each line in the txt file records 1) image name (with extension),
    2) image shape, and 3) compression level, separated by a white space.

    For example, the meta information could be:
    `000_00000000.png (720,1280,3) 1`, which means:
    1) image name (with extension): 000_00000000.png;
    2) image shape: (720,1280,3);
    3) compression level: 1

    We use the image name without extension as the lmdb key.

    If `multiprocessing_read` is True, it will read all the images to
    memory using multiprocessing. Thus, your server needs to have enough
    memory.

    Args:
        data_path (str): Data path for reading images.
        lmdb_path (str): Lmdb save path.
        img_path_list (list[str]): Image path list.
        keys (list[str]): Used for lmdb keys.
        batch (int): After processing batch images, lmdb commits.
            Default: 5000.
        compress_level (int): Compress level when encoding images.
            Default: 1.
        multiprocessing_read (bool): Whether to use multiprocessing to read
            all the images to memory. Default: False.
        n_thread (int): Thread number for multiprocessing. Default: 40.
        map_size (int | None): Map size for lmdb env. If None, use the
            estimated size from images. Default: None.
    """
    assert len(img_path_list) == len(keys), (
        'img_path_list and keys should have the same length, '
        f'but got {len(img_path_list)} and {len(keys)}')
    print(f'Create lmdb for {data_path}, save to {lmdb_path}...')
    print(f'Total images: {len(img_path_list)}')
    if not lmdb_path.endswith('.lmdb'):
        raise ValueError("lmdb_path must end with '.lmdb'.")
    if osp.exists(lmdb_path):
        print(f'Folder {lmdb_path} already exists. Exit.')
        sys.exit(1)
    if multiprocessing_read:
        # read all the images to memory (multiprocessing)
        dataset = {}  # use dict to keep the order for multiprocessing
        shapes = {}
        print(f'Read images with multiprocessing, #thread: {n_thread} ...')
        pbar = tqdm(total=len(img_path_list), unit='image')

        def callback(arg):
            """Get the image data and update pbar."""
            key, dataset[key], shapes[key] = arg
            pbar.update(1)
            pbar.set_description(f'Read {key}')

        pool = Pool(n_thread)
        for path, key in zip(img_path_list, keys):
            pool.apply_async(
                read_img_worker,
                args=(osp.join(data_path, path), key, compress_level),
                callback=callback)
        pool.close()
        pool.join()
        pbar.close()
        print(f'Finish reading {len(img_path_list)} images.')

    # create lmdb environment
    if map_size is None:
        # obtain data size for one image
        img = cv2.imread(
            osp.join(data_path, img_path_list[0]), cv2.IMREAD_UNCHANGED)
        _, img_byte = cv2.imencode(
            '.png', img, [cv2.IMWRITE_PNG_COMPRESSION, compress_level])
        data_size_per_img = img_byte.nbytes
        print('Data size per image is: ', data_size_per_img)
        data_size = data_size_per_img * len(img_path_list)
        map_size = data_size * 10  # allow 10x headroom over the estimate

    env = lmdb.open(lmdb_path, map_size=map_size)

    # write data to lmdb
    pbar = tqdm(total=len(img_path_list), unit='chunk')
    txn = env.begin(write=True)
    txt_file = open(osp.join(lmdb_path, 'meta_info.txt'), 'w')
    for idx, (path, key) in enumerate(zip(img_path_list, keys)):
        pbar.update(1)
        pbar.set_description(f'Write {key}')
        key_byte = key.encode('ascii')
        if multiprocessing_read:
            img_byte = dataset[key]
            h, w, c = shapes[key]
        else:
            _, img_byte, img_shape = read_img_worker(
                osp.join(data_path, path), key, compress_level)
            h, w, c = img_shape

        txn.put(key_byte, img_byte)
        # write meta information
        txt_file.write(f'{key}.png ({h},{w},{c}) {compress_level}\n')
        if idx % batch == 0:
            # commit periodically to bound the transaction size
            txn.commit()
            txn = env.begin(write=True)
    pbar.close()
    txn.commit()
    env.close()
    txt_file.close()
    print('\nFinish writing lmdb.')


def read_img_worker(path, key, compress_level):
    """Read image worker.

    Args:
        path (str): Image path.
        key (str): Image key.
        compress_level (int): Compress level when encoding images.

    Returns:
        str: Image key.
        ndarray: Encoded image buffer (from cv2.imencode).
        tuple[int]: Image shape (h, w, c).
    """
    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    if img.ndim == 2:
        h, w = img.shape
        c = 1
    else:
        h, w, c = img.shape
    _, img_byte = cv2.imencode(
        '.png', img, [cv2.IMWRITE_PNG_COMPRESSION, compress_level])
    return (key, img_byte, (h, w, c))
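
# Example usage of `make_lmdb_from_imgs` (a minimal sketch; the dataset
# directory below is hypothetical, and deriving keys by replacing the path
# separator with '_' is an assumption based on the `000_00000000` key
# convention in the docstring):
#
#     import glob
#     img_root = 'datasets/example/images'
#     img_path_list = sorted(
#         osp.relpath(p, img_root)
#         for p in glob.glob(osp.join(img_root, '**', '*.png'), recursive=True))
#     keys = [osp.splitext(p.replace('/', '_'))[0] for p in img_path_list]
#     make_lmdb_from_imgs(
#         data_path=img_root,
#         lmdb_path='datasets/example/example.lmdb',
#         img_path_list=img_path_list,
#         keys=keys)
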
class LmdbMaker:
    """LMDB Maker.

    Args:
        lmdb_path (str): Lmdb save path.
        map_size (int): Map size for lmdb env. Default: 1024 ** 4, 1TB.
        batch (int): After processing batch images, lmdb commits.
            Default: 5000.
        compress_level (int): Compress level when encoding images.
            Default: 1.
    """

    def __init__(self,
                 lmdb_path,
                 map_size=1024**4,
                 batch=5000,
                 compress_level=1):
        if not lmdb_path.endswith('.lmdb'):
            raise ValueError("lmdb_path must end with '.lmdb'.")
        if osp.exists(lmdb_path):
            print(f'Folder {lmdb_path} already exists. Exit.')
            sys.exit(1)

        self.lmdb_path = lmdb_path
        self.batch = batch
        self.compress_level = compress_level
        self.env = lmdb.open(lmdb_path, map_size=map_size)
        self.txn = self.env.begin(write=True)
        self.txt_file = open(osp.join(lmdb_path, 'meta_info.txt'), 'w')
        self.counter = 0

    def put(self, img_byte, key, img_shape):
        self.counter += 1
        key_byte = key.encode('ascii')
        self.txn.put(key_byte, img_byte)
        # write meta information
        h, w, c = img_shape
        self.txt_file.write(f'{key}.png ({h},{w},{c}) {self.compress_level}\n')
        if self.counter % self.batch == 0:
            self.txn.commit()
            self.txn = self.env.begin(write=True)

    def close(self):
        self.txn.commit()
        self.env.close()
        self.txt_file.close()
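
# A minimal sketch of using `LmdbMaker` incrementally together with
# `read_img_worker`. The image root, file names, and keys below are
# hypothetical; adapt them to your own dataset before running.
if __name__ == '__main__':
    img_root = 'datasets/example/images'  # hypothetical image folder
    paths = ['000/00000000.png', '000/00000001.png']  # hypothetical files
    keys = ['000_00000000', '000_00000001']

    maker = LmdbMaker('datasets/example/example.lmdb')
    for path, key in zip(paths, keys):
        # read_img_worker returns (key, encoded image buffer, (h, w, c))
        _, img_byte, img_shape = read_img_worker(
            osp.join(img_root, path), key, compress_level=1)
        maker.put(img_byte, key, img_shape)
    maker.close()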