VRIS_vip / refer /folder2lmdb.py
dianecy's picture
Add files using upload-large-folder tool
2c58401 verified
raw
history blame
3.2 kB
import argparse
import os
import os.path as osp
import lmdb
import pyarrow as pa
import json
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
def loads_pyarrow(buf):
"""
Args:
buf: the output of `dumps`.
"""
return pa.deserialize(buf)
def raw_reader(path):
with open(path, 'rb') as f:
bin_data = f.read()
return bin_data
def dumps_pyarrow(obj):
"""
Serialize an object.
Returns:
Implementation-dependent bytes-like object
"""
return pa.serialize(obj).to_buffer()
def folder2lmdb(json_data, img_dir, mask_dir, output_dir, split, write_frequency=1000):
lmdb_path = osp.join(output_dir, "%s.lmdb" % split)
isdir = os.path.isdir(lmdb_path)
print("Generate LMDB to %s" % lmdb_path)
db = lmdb.open(lmdb_path, subdir=isdir,
map_size=1099511627776 * 2, readonly=False,
meminit=False, map_async=True)
txn = db.begin(write=True)
tbar = tqdm(json_data)
for idx, item in enumerate(tbar):
img = raw_reader(osp.join(img_dir, item['img_name']))
mask = raw_reader(osp.join(mask_dir, f"{item['segment_id']}.png"))
data = {'img': img, 'mask': mask, 'cat': item['cat'],
'seg_id': item['segment_id'], 'img_name': item['img_name'],
'num_sents': item['sentences_num'], 'sents': [i['sent'] for i in item['sentences']]}
txn.put(u'{}'.format(idx).encode('ascii'), dumps_pyarrow(data))
if idx % write_frequency == 0:
# print("[%d/%d]" % (idx, len(data_loader)))
txn.commit()
txn = db.begin(write=True)
# finish iterating through dataset
txn.commit()
keys = [u'{}'.format(k).encode('ascii') for k in range(idx + 1)]
with db.begin(write=True) as txn:
txn.put(b'__keys__', dumps_pyarrow(keys))
txn.put(b'__len__', dumps_pyarrow(len(keys)))
print("Flushing database ...")
db.sync()
db.close()
def parse_args():
parser = argparse.ArgumentParser(description='COCO Folder to LMDB.')
parser.add_argument('-j', '--json-dir', type=str,
default='',
help='the name of json file.')
parser.add_argument('-i', '--img-dir', type=str,
default='refcoco+',
help='the folder of images.')
parser.add_argument('-m', '--mask-dir', type=str,
default='refcoco+',
help='the folder of masks.')
parser.add_argument('-o', '--output-dir', type=str,
default='refcoco+',
help='the folder of output lmdb file.')
parser.add_argument('-s', '--split', type=str,
default='train',
help='the split type.')
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
args.split = osp.basename(args.json_dir).split(".")[0]
os.makedirs(args.output_dir, exist_ok=True)
with open(args.json_dir, 'rb') as f:
json_data = json.load(f)
folder2lmdb(json_data, args.img_dir, args.mask_dir, args.output_dir, args.split)