# make.texts.py
from __future__ import print_function
import os
import os.path as osp
import json
import pprint
# import gensim
# from gensim.models import Doc2Vec
import numpy as np
import scipy.io as sio
from pycocotools.coco import COCO
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm

"""process texts
Python 2 is required by `jhlau/doc2vec`, and the COCO API works with Python 2.7,
so a Python 2.7 virtual environment is used here.

dependencies:
    matplotlib (COCO api)
    smart_open (gensim)
"""

# The original COCO annotations already assign each class an ID, but the IDs are not
# contiguous (labeled 1..90 while there are only 80 classes). Here, contiguous 0-based
# class IDs are re-defined following the ascending order of the original category IDs.
# Both train and val contain all the classes, so only the val set is processed here.
# The result is written to class-name.COCO.txt

def remake_classname(): 
    """process class order
    Record the mapping between the contiguous 0-based class ID,
    the original class ID and the class name in `class-name.COCO.txt`,
    with format `<new ID> <original ID> <class name>`.

    The class order follows the ascending order of the original IDs.
    (A small reader sketch for this file follows the function.)
    """

    COCO_P = "/dataset/coco"
    ANNO_P = osp.join(COCO_P, "annotations")
    SPLIT = ["val", "train"]

    for _split in SPLIT:
        print("---", _split, "---")
        anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(_split))
        coco = COCO(anno_file)
        cats = coco.loadCats(coco.getCatIds())
        # print(cats[0])
        cls_id = {c["name"]: c["id"] for c in cats}  # 它本身就是按 category id 升序
        # pprint.pprint(cls_id)
        with open("class-name.COCO.txt", "w") as f:
            for new_id, c in enumerate(cls_id):
                old_id = cls_id[c]# - 1
                cn = c.replace(" ", "_")
                # format: <new ID> <original ID> <class name>
                f.write("{} {} {}\n".format(new_id, old_id, cn))

        break  # only the val set is needed, since it already covers all classes
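
# Usage sketch (not called by the original script): read `class-name.COCO.txt` back into
# a mapping {original COCO category id -> (new 0-based id, class name)}.
def load_classname(path="class-name.COCO.txt"):
    mapping = {}
    with open(path, "r") as f:
        for line in f:
            # format: <new ID> <original ID> <class name>, spaces in names replaced by "_"
            new_id, old_id, name = line.strip().split()
            mapping[int(old_id)] = (int(new_id), name)
    return mapping
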

def remake_idmap():
    # Merge the train and val sets and re-number the data with 0-based IDs, in ascending
    # order of the original IDs (the numbers in the image file names, which are likewise
    # non-contiguous; train and val do not overlap).
    # The result is written to id-map.COCO.txt
    # make.id-map.py
    """discretization of the original file ID
    Map the file ID to sequential {0, 1, ..., n},
    and record this mapping in `id-map.txt`,
    with format `<new id> <original id> <image file name>`.

    Note that the new ids are 0-base.
    """

    TRAIN_P = "train2017"
    VAL_P = "val2017"

    file_list = [f for f in os.listdir(os.path.join("/dataset/coco", TRAIN_P)) if (".jpg" in f)]
    file_list.extend([f for f in os.listdir(os.path.join("/dataset/coco", VAL_P)) if (".jpg" in f)])
    print("#data:", len(file_list))  # 12,3287

    id_key = lambda x: int(x.split(".jpg")[0])
    file_list = sorted(file_list, key=id_key)  # ascending by image ID
    # print(file_list[:15])

    with open("id-map.COCO.txt", "w") as f:
        # format: <new id> <original id> <image file name>
        for i, f_name in enumerate(file_list):
            _original_id = id_key(f_name)
            f.write("{} {} {}\n".format(i, _original_id, f_name))
            # if i > 5: break
    print("DONE")


# COCO
COCO_P = "/dataset/coco"
ANNO_P = osp.join(COCO_P, "annotations")
SPLIT = ["val", "train"]
# doc2vec
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
start_alpha = 0.01
infer_epoch = 1000
DIM = 300  # dimension of the doc2vec feature
# id_map_data = {}
# with open("id-map.txt", "r") as f:
#     for line in f:
#         line = line.strip()
#         _new_id, _old_id, _ = line.split()
#         id_map_data[int(_old_id)] = int(_new_id)
# N_DATA = len(id_map_data)
# print("#data:", N_DATA)

# pre-trained Doc2Vec model
# model = Doc2Vec.load(MODEL)
tokenizer = RegexpTokenizer(r'\w+')
def dataset_format(filepath, filename, imgid, split, sentences, cocoid): 
    data = {} 
    data['filepath'] = filepath 
    data['sentids'] = [imgid * 5 + idx for idx in range(5)]
    data['filename'] = filename
    data['imgid'] = imgid 
    data['split'] = split 
    data['sentences'] = [{'tokens': tokenizer.tokenize(sentence), 
                            'raw': sentence, 
                            'imgid': imgid, 
                            'sentid': imgid * 5 + idx} 
                        for idx, sentence in enumerate(sentences)] 
    data['cocoid'] = cocoid 
    return data 
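
# A sketch of the record produced by dataset_format (field values are placeholders, not
# taken from the real data):
#     {
#       'filepath': 'val2017',                        # split folder
#       'sentids': [imgid * 5, ..., imgid * 5 + 4],   # global sentence ids
#       'filename': '<image file name>.jpg',
#       'imgid': <new 0-based image id from id-map.COCO.txt>,
#       'split': 'val',
#       'sentences': [{'tokens': [...], 'raw': '<caption>',
#                      'imgid': <imgid>, 'sentid': <imgid * 5 + idx>}, ...],
#       'cocoid': <original COCO image id>,
#     }
# Note that `sentids` assumes exactly 5 captions per image, while some COCO images carry
# a few more; extra captions still receive a `sentid` but are not listed in `sentids`.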

dataset_anns = {}
dataset_anns['images'] = []
dataset_anns['dataset'] = 'coco'
for __split in SPLIT:
    print("---", __split, "---")
    anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(__split))
    caps_file = osp.join(ANNO_P, "captions_{}2017.json".format(__split))
    coco = COCO(anno_file)
    coco_caps = COCO(caps_file)
    new_image_id_file = open("id-map.COCO.txt", 'r')
    new_img_id_map = {image_id.strip().split(" ")[2]: image_id.strip().split(" ")[0] for image_id in new_image_id_file.readlines()}
    id_list = coco.getImgIds()
    for _old_id in tqdm(id_list):
        # _new_id = id_map_data[_old_id]
        _annIds = coco_caps.getAnnIds(imgIds=_old_id)
        _anns = coco_caps.loadAnns(_annIds)

        _filepath = __split + '2017'
        _filename = coco.imgs[_old_id]['file_name'] 
        _imgid = int(new_img_id_map[_filename])
        _split = __split
        # print(len(anns))
        # pprint.pprint(anns)
        _sentences = [_a["caption"] for _a in _anns]
        _cocoid = _old_id
        formated_data = dataset_format(_filepath, _filename, _imgid, _split, _sentences, _cocoid)
        dataset_anns['images'].append(formated_data)
        # pprint.pprint(sentences)
        # sentences = [gensim.utils.simple_preprocess(s) for s in sentences]
        # pprint.pprint(sentences)
        # doc = []
        # for s in sentences:
        #     doc.extend(s)
        # print(doc)
        # vec = model.infer_vector(doc)
        # print(vec.shape)
        # texts.append(vec[np.newaxis, :])
        # break
    # break

with open('dataset_anns.json', 'w') as fp:
    json.dump(dataset_anns, fp)

new_image_id_file.close()

# texts = np.vstack(texts).astype(np.float32)
# print("texts:", texts.shape, texts.dtype)  # (123287, 300) dtype('<f4')
# sio.savemat("texts.COCO.doc2vec.{}.mat".format(DIM), {"texts": texts})