import os

import cv2
import lmdb
import numpy as np
from paddle.io import Dataset

from .imaug import create_operators, transform


class LMDBDataSet(Dataset):
    def __init__(self, config, mode, logger, seed=None):
        super(LMDBDataSet, self).__init__()

        global_config = config["Global"]
        dataset_config = config[mode]["dataset"]
        loader_config = config[mode]["loader"]
        batch_size = loader_config["batch_size_per_card"]
        data_dir = dataset_config["data_dir"]
        self.do_shuffle = loader_config["shuffle"]

        self.lmdb_sets = self.load_hierarchical_lmdb_dataset(data_dir)
        logger.info("Initialize indexs of datasets:%s" % data_dir)
        self.data_idx_order_list = self.dataset_traversal()
        if self.do_shuffle:
            np.random.shuffle(self.data_idx_order_list)
        self.ops = create_operators(dataset_config["transforms"], global_config)
        self.ext_op_transform_idx = dataset_config.get("ext_op_transform_idx", 2)

        ratio_list = dataset_config.get("ratio_list", [1.0])
        self.need_reset = True in [x < 1 for x in ratio_list]

    def load_hierarchical_lmdb_dataset(self, data_dir):
        # Every leaf directory under data_dir is treated as one LMDB environment.
        lmdb_sets = {}
        dataset_idx = 0
        for dirpath, dirnames, filenames in os.walk(data_dir + "/"):
            if not dirnames:
                env = lmdb.open(
                    dirpath,
                    max_readers=32,
                    readonly=True,
                    lock=False,
                    readahead=False,
                    meminit=False,
                )
                txn = env.begin(write=False)
                num_samples = int(txn.get("num-samples".encode()))
                lmdb_sets[dataset_idx] = {
                    "dirpath": dirpath,
                    "env": env,
                    "txn": txn,
                    "num_samples": num_samples,
                }
                dataset_idx += 1
        return lmdb_sets

    def dataset_traversal(self):
        lmdb_num = len(self.lmdb_sets)
        total_sample_num = 0
        for lno in range(lmdb_num):
            total_sample_num += self.lmdb_sets[lno]["num_samples"]
        data_idx_order_list = np.zeros((total_sample_num, 2))
        beg_idx = 0
        for lno in range(lmdb_num):
            tmp_sample_num = self.lmdb_sets[lno]["num_samples"]
            end_idx = beg_idx + tmp_sample_num
            data_idx_order_list[beg_idx:end_idx, 0] = lno
            data_idx_order_list[beg_idx:end_idx, 1] = list(range(tmp_sample_num))
            # LMDB sample keys are 1-based ("image-000000001"), so shift the index.
            data_idx_order_list[beg_idx:end_idx, 1] += 1
            beg_idx = beg_idx + tmp_sample_num
        return data_idx_order_list

    def get_img_data(self, value):
        """Decode a raw image buffer into a BGR image array."""
        if not value:
            return None
        imgdata = np.frombuffer(value, dtype="uint8")
        if imgdata is None:
            return None
        imgori = cv2.imdecode(imgdata, 1)
        if imgori is None:
            return None
        return imgori

    def get_ext_data(self):
        ext_data_num = 0
        for op in self.ops:
            if hasattr(op, "ext_data_num"):
                ext_data_num = getattr(op, "ext_data_num")
                break
        load_data_ops = self.ops[: self.ext_op_transform_idx]
        ext_data = []

        while len(ext_data) < ext_data_num:
            lmdb_idx, file_idx = self.data_idx_order_list[
                np.random.randint(self.__len__())
            ]
            lmdb_idx = int(lmdb_idx)
            file_idx = int(file_idx)
            sample_info = self.get_lmdb_sample_info(
                self.lmdb_sets[lmdb_idx]["txn"], file_idx
            )
            if sample_info is None:
                continue
            img, label = sample_info
            data = {"image": img, "label": label}
            # Run only the leading (load/decode) transforms; skip samples they reject.
            data = transform(data, load_data_ops)
            if data is None:
                continue
            ext_data.append(data)
        return ext_data

    def get_lmdb_sample_info(self, txn, index):
        label_key = "label-%09d".encode() % index
        label = txn.get(label_key)
        if label is None:
            return None
        label = label.decode("utf-8")
        img_key = "image-%09d".encode() % index
        imgbuf = txn.get(img_key)
        return imgbuf, label

    def __getitem__(self, idx):
        lmdb_idx, file_idx = self.data_idx_order_list[idx]
        lmdb_idx = int(lmdb_idx)
        file_idx = int(file_idx)
        sample_info = self.get_lmdb_sample_info(
            self.lmdb_sets[lmdb_idx]["txn"], file_idx
        )
        if sample_info is None:
            return self.__getitem__(np.random.randint(self.__len__()))
        img, label = sample_info
        data = {"image": img, "label": label}
        data["ext_data"] = self.get_ext_data()
        outs = transform(data, self.ops)
        if outs is None:
            # Resample a random index if any transform drops this sample.
            return self.__getitem__(np.random.randint(self.__len__()))
        return outs

    def __len__(self):
        return self.data_idx_order_list.shape[0]
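

# Usage sketch (illustrative only): how this dataset is typically constructed
# from a PaddleOCR-style config dict. The data_dir path is a hypothetical
# placeholder, and the "transforms" list must name operators actually
# registered in .imaug; the keys below mirror what __init__ reads.
#
#     import logging
#
#     logger = logging.getLogger(__name__)
#     config = {
#         "Global": {},
#         "Train": {
#             "dataset": {
#                 "data_dir": "./train_data/lmdb/",  # hypothetical LMDB root
#                 "transforms": [...],               # decode / label-encode / resize ops
#             },
#             "loader": {"shuffle": True, "batch_size_per_card": 64},
#         },
#     }
#     dataset = LMDBDataSet(config, mode="Train", logger=logger)
#     print(len(dataset))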