|
import argparse |
|
import os |
|
import os.path as osp |
|
import lmdb |
|
import pyarrow as pa |
|
import json |
|
from tqdm import tqdm |
|
import warnings |
|
warnings.filterwarnings("ignore") |
|
|
|
|
|
def loads_pyarrow(buf):
    """Rebuild a Python object from its pyarrow-serialized form.

    Args:
        buf: the output of `dumps`.

    Returns:
        The object produced by `pa.deserialize` for `buf`.
    """
    # NOTE(review): newer pyarrow releases deprecate `pa.deserialize` --
    # confirm the pinned pyarrow version still provides it.
    restored = pa.deserialize(buf)
    return restored
|
|
|
|
|
def raw_reader(path):
    """Return the complete contents of the file at `path` as bytes.

    Args:
        path: location of the file to read; it is opened in binary mode.

    Returns:
        The raw, undecoded bytes of the whole file.
    """
    with open(path, 'rb') as handle:
        return handle.read()
|
|
|
|
|
def dumps_pyarrow(obj):
    """Serialize `obj` with pyarrow and expose the result as a buffer.

    Args:
        obj: the object to serialize.

    Returns:
        Implementation-dependent bytes-like object
    """
    # NOTE(review): newer pyarrow releases deprecate `pa.serialize` --
    # confirm the pinned pyarrow version still provides it.
    serialized = pa.serialize(obj)
    return serialized.to_buffer()
|
|
|
|
|
def folder2lmdb(json_data, img_dir, mask_dir, output_dir, split, write_frequency=1000):
    """Pack images, masks and their annotations into one LMDB database.

    Each entry of `json_data` is stored under its zero-based position
    (the decimal index encoded as ASCII) as a pyarrow-serialized dict with
    the keys 'img', 'mask', 'cat', 'seg_id', 'img_name', 'num_sents' and
    'sents'.  Two bookkeeping records are written afterwards: `__keys__`
    (the list of all entry keys) and `__len__` (the number of entries).

    Args:
        json_data: iterable of dicts.  Every dict must provide 'img_name',
            'segment_id', 'cat', 'sentences_num' and 'sentences' (a list of
            dicts that each have a 'sent' field).  May be empty.
        img_dir: directory holding the files named by 'img_name'.
        mask_dir: directory holding the '<segment_id>.png' mask files.
        output_dir: directory in which '<split>.lmdb' is created.
        split: split name, used as the stem of the LMDB path.
        write_frequency: number of entries written per transaction before
            it is committed and a new one is started.
    """
    lmdb_path = osp.join(output_dir, "%s.lmdb" % split)
    # Reuse the directory layout only if the target already is a directory;
    # otherwise LMDB stores the database as a single file at this path.
    isdir = os.path.isdir(lmdb_path)

    print("Generate LMDB to %s" % lmdb_path)
    db = lmdb.open(lmdb_path, subdir=isdir,
                   map_size=1099511627776 * 2,  # 2 * 2**40 bytes = 2 TiB
                   readonly=False,
                   meminit=False, map_async=True)

    try:
        # Track the number of stored entries explicitly so that an empty
        # `json_data` yields an empty database instead of a NameError on
        # the loop variable.
        count = 0
        txn = db.begin(write=True)
        for idx, item in enumerate(tqdm(json_data)):
            img = raw_reader(osp.join(img_dir, item['img_name']))
            mask = raw_reader(osp.join(mask_dir, f"{item['segment_id']}.png"))
            data = {
                'img': img,
                'mask': mask,
                'cat': item['cat'],
                'seg_id': item['segment_id'],
                'img_name': item['img_name'],
                'num_sents': item['sentences_num'],
                'sents': [sentence['sent'] for sentence in item['sentences']],
            }
            txn.put(str(idx).encode('ascii'), dumps_pyarrow(data))
            count = idx + 1
            # Commit once a full batch has been written (not right after
            # the very first record) to bound the size of a transaction.
            if count % write_frequency == 0:
                txn.commit()
                txn = db.begin(write=True)

        # Flush whatever is left in the last, possibly partial, batch.
        txn.commit()

        keys = [str(k).encode('ascii') for k in range(count)]
        with db.begin(write=True) as txn:
            txn.put(b'__keys__', dumps_pyarrow(keys))
            txn.put(b'__len__', dumps_pyarrow(len(keys)))

        print("Flushing database ...")
        db.sync()
    finally:
        # Always release the environment, even when reading an input file
        # or writing a record fails part-way through.
        db.close()
|
|
|
|
|
def parse_args():
    """Parse the command-line options of the LMDB conversion script.

    Returns:
        argparse.Namespace with the string attributes `json_dir`,
        `img_dir`, `mask_dir`, `output_dir` and `split`.
    """
    # (short flag, long flag, default value, help text) for every option.
    option_table = (
        ('-j', '--json-dir', '', 'the name of json file.'),
        ('-i', '--img-dir', 'refcoco+', 'the folder of images.'),
        ('-m', '--mask-dir', 'refcoco+', 'the folder of masks.'),
        ('-o', '--output-dir', 'refcoco+', 'the folder of output lmdb file.'),
        ('-s', '--split', 'train', 'the split type.'),
    )

    parser = argparse.ArgumentParser(description='COCO Folder to LMDB.')
    for short_flag, long_flag, fallback, help_text in option_table:
        parser.add_argument(short_flag, long_flag, type=str,
                            default=fallback, help=help_text)
    return parser.parse_args()
|
|
|
|
|
if __name__ == '__main__':
    args = parse_args()

    # The split name is always taken from the JSON file name (the text
    # before its first dot); this replaces any value given via -s/--split.
    json_file_name = osp.basename(args.json_dir)
    args.split = json_file_name.partition(".")[0]

    os.makedirs(args.output_dir, exist_ok=True)

    # Despite its name, `json_dir` is the path of the annotation JSON file.
    with open(args.json_dir, 'rb') as json_file:
        json_data = json.load(json_file)

    folder2lmdb(
        json_data,
        args.img_dir,
        args.mask_dir,
        args.output_dir,
        args.split,
    )
|
|