import cv2
import lmdb
import sys
from multiprocessing import Pool
from os import path as osp
from tqdm import tqdm


def make_lmdb_from_imgs(data_path,
                        lmdb_path,
                        img_path_list,
                        keys,
                        batch=5000,
                        compress_level=1,
                        multiprocessing_read=False,
                        n_thread=40,
                        map_size=None):
"""Make lmdb from images. |
|
|
|
Contents of lmdb. The file structure is: |
|
|
|
:: |
|
|
|
example.lmdb |
|
├── data.mdb |
|
├── lock.mdb |
|
├── meta_info.txt |
|
|
|
The data.mdb and lock.mdb are standard lmdb files and you can refer to |
|
https://lmdb.readthedocs.io/en/release/ for more details. |
|
|
|
The meta_info.txt is a specified txt file to record the meta information |
|
of our datasets. It will be automatically created when preparing |
|
datasets by our provided dataset tools. |
|
Each line in the txt file records 1)image name (with extension), |
|
2)image shape, and 3)compression level, separated by a white space. |
|
|
|
For example, the meta information could be: |
|
`000_00000000.png (720,1280,3) 1`, which means: |
|
1) image name (with extension): 000_00000000.png; |
|
2) image shape: (720,1280,3); |
|
3) compression level: 1 |
|
|
|
We use the image name without extension as the lmdb key. |
|
|
|
If `multiprocessing_read` is True, it will read all the images to memory |
|
using multiprocessing. Thus, your server needs to have enough memory. |
|
|
|
Args: |
|
data_path (str): Data path for reading images. |
|
lmdb_path (str): Lmdb save path. |
|
img_path_list (str): Image path list. |
|
keys (str): Used for lmdb keys. |
|
batch (int): After processing batch images, lmdb commits. |
|
Default: 5000. |
|
compress_level (int): Compress level when encoding images. Default: 1. |
|
multiprocessing_read (bool): Whether use multiprocessing to read all |
|
the images to memory. Default: False. |
|
n_thread (int): For multiprocessing. |
|
map_size (int | None): Map size for lmdb env. If None, use the |
|
estimated size from images. Default: None |
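
    Example:
        A minimal sketch of a call; the folder layout, file names and keys
        below are hypothetical:

        >>> img_path_list = ['0001.png', '0002.png']
        >>> keys = ['0001', '0002']
        >>> make_lmdb_from_imgs('datasets/example', 'datasets/example.lmdb',
        ...                     img_path_list, keys)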
    """
    assert len(img_path_list) == len(keys), ('img_path_list and keys should have the same length, '
                                             f'but got {len(img_path_list)} and {len(keys)}')
    print(f'Create lmdb for {data_path}, save to {lmdb_path}...')
    print(f'Total images: {len(img_path_list)}')
    if not lmdb_path.endswith('.lmdb'):
        raise ValueError("lmdb_path must end with '.lmdb'.")
    if osp.exists(lmdb_path):
        print(f'Folder {lmdb_path} already exists. Exit.')
        sys.exit(1)

    if multiprocessing_read:
        # Read all images into memory with a process pool. Each worker
        # returns (key, encoded bytes, shape); the callback stores them.
        dataset = {}
        shapes = {}
        print(f'Read images with multiprocessing, #thread: {n_thread} ...')
        pbar = tqdm(total=len(img_path_list), unit='image')

        def callback(arg):
            """Store the image data and update pbar."""
            key, dataset[key], shapes[key] = arg
            pbar.update(1)
            pbar.set_description(f'Read {key}')

        pool = Pool(n_thread)
        for path, key in zip(img_path_list, keys):
            pool.apply_async(read_img_worker, args=(osp.join(data_path, path), key, compress_level), callback=callback)
        pool.close()
        pool.join()
        pbar.close()
        print(f'Finish reading {len(img_path_list)} images.')

    if map_size is None:
        # Estimate the lmdb map size from the first image: assume all images
        # encode to a similar size and add a 10x safety margin.
        img = cv2.imread(osp.join(data_path, img_path_list[0]), cv2.IMREAD_UNCHANGED)
        _, img_byte = cv2.imencode('.png', img, [cv2.IMWRITE_PNG_COMPRESSION, compress_level])
        data_size_per_img = img_byte.nbytes
        print('Data size per image is: ', data_size_per_img)
        data_size = data_size_per_img * len(img_path_list)
        map_size = data_size * 10

    env = lmdb.open(lmdb_path, map_size=map_size)

    pbar = tqdm(total=len(img_path_list), unit='chunk')
    txn = env.begin(write=True)
    txt_file = open(osp.join(lmdb_path, 'meta_info.txt'), 'w')
    for idx, (path, key) in enumerate(zip(img_path_list, keys)):
        pbar.update(1)
        pbar.set_description(f'Write {key}')
        key_byte = key.encode('ascii')
        if multiprocessing_read:
            img_byte = dataset[key]
            h, w, c = shapes[key]
        else:
            _, img_byte, img_shape = read_img_worker(osp.join(data_path, path), key, compress_level)
            h, w, c = img_shape

        txn.put(key_byte, img_byte)
        # Record the meta information: image name (re-encoded as PNG),
        # image shape and compress level.
        txt_file.write(f'{key}.png ({h},{w},{c}) {compress_level}\n')
        # Commit periodically so the write transaction does not grow unbounded.
        if idx % batch == 0:
            txn.commit()
            txn = env.begin(write=True)
    pbar.close()
    txn.commit()
    env.close()
    txt_file.close()
    print('\nFinish writing lmdb.')


def read_img_worker(path, key, compress_level):
    """Read image worker.

    Args:
        path (str): Image path.
        key (str): Image key.
        compress_level (int): Compress level when encoding images.

    Returns:
        str: Image key.
        bytes: Image content encoded as PNG bytes.
        tuple[int]: Image shape (h, w, c).
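
    Example:
        An illustrative call; the path below is hypothetical:

        >>> key, img_byte, (h, w, c) = read_img_worker(
        ...     'datasets/example/0001.png', '0001', compress_level=1)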
    """
    img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    # Grayscale images have no channel dimension; treat them as c=1.
    if img.ndim == 2:
        h, w = img.shape
        c = 1
    else:
        h, w, c = img.shape
    _, img_byte = cv2.imencode('.png', img, [cv2.IMWRITE_PNG_COMPRESSION, compress_level])
    return (key, img_byte, (h, w, c))


class LmdbMaker:
    """LMDB Maker.

    Args:
        lmdb_path (str): Lmdb save path.
        map_size (int): Map size for lmdb env. Default: 1024 ** 4, 1TB.
        batch (int): After processing batch images, lmdb commits.
            Default: 5000.
        compress_level (int): Compress level when encoding images. Default: 1.
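
    Example:
        A minimal sketch of the put/close flow; the paths and key below are
        hypothetical:

        >>> maker = LmdbMaker('datasets/example.lmdb')
        >>> _, img_byte, img_shape = read_img_worker(
        ...     'datasets/example/0001.png', '0001', compress_level=1)
        >>> maker.put(img_byte, '0001', img_shape)
        >>> maker.close()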
    """

    def __init__(self, lmdb_path, map_size=1024**4, batch=5000, compress_level=1):
        if not lmdb_path.endswith('.lmdb'):
            raise ValueError("lmdb_path must end with '.lmdb'.")
        if osp.exists(lmdb_path):
            print(f'Folder {lmdb_path} already exists. Exit.')
            sys.exit(1)

        self.lmdb_path = lmdb_path
        self.batch = batch
        self.compress_level = compress_level
        self.env = lmdb.open(lmdb_path, map_size=map_size)
        self.txn = self.env.begin(write=True)
        self.txt_file = open(osp.join(lmdb_path, 'meta_info.txt'), 'w')
        self.counter = 0

    def put(self, img_byte, key, img_shape):
        """Put one encoded image into lmdb and record its meta information."""
        self.counter += 1
        key_byte = key.encode('ascii')
        self.txn.put(key_byte, img_byte)
        # Record the meta information: image name, shape and compress level.
        h, w, c = img_shape
        self.txt_file.write(f'{key}.png ({h},{w},{c}) {self.compress_level}\n')
        # Commit periodically to bound the size of the write transaction.
        if self.counter % self.batch == 0:
            self.txn.commit()
            self.txn = self.env.begin(write=True)

    def close(self):
        """Commit the pending transaction and close the lmdb environment."""
        self.txn.commit()
        self.env.close()
        self.txt_file.close()