File size: 3,196 Bytes
2c58401
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import argparse
import os
import os.path as osp
import lmdb
import pyarrow as pa
import json
from tqdm import tqdm
import warnings
# Suppress every warning for the whole process. This is a blanket filter, so
# it also hides warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")


def loads_pyarrow(buf):
    """
    Rebuild a Python object from a pyarrow-serialized payload.

    Args:
        buf: the output of `dumps`.
    Returns:
        The object reconstructed by `pa.deserialize`.
    """
    # NOTE(review): `pa.deserialize` is believed to be deprecated in newer
    # pyarrow releases -- confirm against the pinned pyarrow version.
    restored = pa.deserialize(buf)
    return restored


def raw_reader(path):
    """
    Read a file in binary mode and return its entire contents.

    Args:
        path: location of the file to read.
    Returns:
        The raw bytes of the file, undecoded.
    """
    with open(path, mode='rb') as stream:
        return stream.read()


def dumps_pyarrow(obj):
    """
    Serialize an object with pyarrow.

    Args:
        obj: the object to serialize.
    Returns:
        The buffer obtained from `pa.serialize(obj).to_buffer()`.
    """
    # NOTE(review): `pa.serialize` is believed to be deprecated in newer
    # pyarrow releases -- confirm against the pinned pyarrow version.
    serialized = pa.serialize(obj)
    return serialized.to_buffer()


def folder2lmdb(json_data, img_dir, mask_dir, output_dir, split, write_frequency=1000):
    """
    Pack images, masks and their annotations into one LMDB database.

    Each entry of `json_data` becomes one record stored under its index
    (ASCII-encoded) as key. Two extra records are written at the end:
    `__keys__` (the list of all record keys) and `__len__` (their count).

    Args:
        json_data: iterable of dicts providing the keys 'img_name',
            'segment_id', 'cat', 'sentences_num' and 'sentences' (a list of
            dicts that each provide 'sent').
        img_dir: folder containing the files named by 'img_name'.
        mask_dir: folder containing the masks, named '<segment_id>.png'.
        output_dir: folder in which '<split>.lmdb' is created.
        split: name used for the database file.
        write_frequency: the write transaction is committed and reopened
            whenever the record index is a multiple of this value.

    Raises:
        OSError: if an image or mask file cannot be read.
        KeyError: if an item lacks one of the expected keys.
    """
    lmdb_path = osp.join(output_dir, "%s.lmdb" % split)
    # Only treat the path as a directory when it already exists as one;
    # otherwise lmdb is told to use it as a plain file name.
    isdir = os.path.isdir(lmdb_path)

    print("Generate LMDB to %s" % lmdb_path)
    # map_size is the maximum size the database may grow to (2 TiB here).
    db = lmdb.open(lmdb_path, subdir=isdir,
                   map_size=1099511627776 * 2, readonly=False,
                   meminit=False, map_async=True)

    # Close the environment even when reading or serializing an item fails,
    # so a crashed run does not leave the handle open.
    try:
        # Counted explicitly so that an empty `json_data` yields an empty
        # database instead of failing on an unbound loop variable.
        num_records = 0
        txn = db.begin(write=True)
        for idx, item in enumerate(tqdm(json_data)):
            img = raw_reader(osp.join(img_dir, item['img_name']))
            mask = raw_reader(osp.join(mask_dir, f"{item['segment_id']}.png"))
            data = {'img': img, 'mask': mask, 'cat': item['cat'],
                    'seg_id': item['segment_id'], 'img_name': item['img_name'],
                    'num_sents': item['sentences_num'],
                    'sents': [i['sent'] for i in item['sentences']]}
            txn.put(str(idx).encode('ascii'), dumps_pyarrow(data))
            num_records = idx + 1
            if idx % write_frequency == 0:
                # Commit periodically to bound the size of one transaction.
                txn.commit()
                txn = db.begin(write=True)

        # Commit whatever is left in the last (possibly partial) batch.
        txn.commit()

        keys = [str(k).encode('ascii') for k in range(num_records)]
        with db.begin(write=True) as txn:
            txn.put(b'__keys__', dumps_pyarrow(keys))
            txn.put(b'__len__', dumps_pyarrow(len(keys)))

        print("Flushing database ...")
        # The environment was opened with map_async=True, so flush explicitly.
        db.sync()
    finally:
        db.close()


def parse_args():
    """
    Parse the command-line options of the LMDB conversion script.

    Returns:
        The `argparse.Namespace` with `json_dir`, `img_dir`, `mask_dir`,
        `output_dir` and `split`, all strings.
    """
    # (short flag, long flag, default value, help text), in display order.
    option_table = (
        ('-j', '--json-dir', '', 'the name of json file.'),
        ('-i', '--img-dir', 'refcoco+', 'the folder of images.'),
        ('-m', '--mask-dir', 'refcoco+', 'the folder of masks.'),
        ('-o', '--output-dir', 'refcoco+', 'the folder of output lmdb file.'),
        ('-s', '--split', 'train', 'the split type.'),
    )

    cli = argparse.ArgumentParser(description='COCO Folder to LMDB.')
    for short_flag, long_flag, fallback, description in option_table:
        cli.add_argument(short_flag, long_flag, type=str,
                         default=fallback, help=description)
    return cli.parse_args()


if __name__ == '__main__':
    args = parse_args()

    # The split name is always taken from the JSON file name (the part before
    # the first dot); any value passed through --split is overwritten here.
    json_file_name = osp.basename(args.json_dir)
    args.split = json_file_name.split(".")[0]

    # Make sure the destination folder exists before the database is created.
    os.makedirs(args.output_dir, exist_ok=True)

    with open(args.json_dir, 'rb') as f:
        json_data = json.load(f)

    folder2lmdb(
        json_data,
        img_dir=args.img_dir,
        mask_dir=args.mask_dir,
        output_dir=args.output_dir,
        split=args.split,
    )