# make.texts.py from __future__ import print_function import os import os.path as osp from pycocotools.coco import COCO # import gensim # from gensim.models import Doc2Vec import numpy as np import scipy.io as sio import os import os.path as osp from pycocotools.coco import COCO import pprint import os import os.path as osp import json from nltk.tokenize import RegexpTokenizer from tqdm import tqdm """process texts python 2 needed by `jhlau/doc2vec`, and COCO api CAN work with python 2.7. So I choose to create a virtual env of python 2.7. dependencies: matplotlib (COCO api) smart_open (gensim) """ # COCO 原本的 annotations 中就有各 classes 的 ID,但不连续(从 1 标到 90 但实际只有 80 个)。这里按原有的 category id 的升序重新定义连续的、0-based 的 class ID。 # train 和 val 都包含所有类,所以这里只用 val set 处理。 # 结果写入 class-name.COCO.txt def remake_classname(): """process class order Record the mapping between tightened/discretized 0-base class ID, original class ID and class name in `class-name.COCO.txt`, with format ` `. The class order is consistent to the ascending order of the original IDs. """ COCO_P = "/dataset/coco" ANNO_P = osp.join(COCO_P, "annotations") SPLIT = ["val", "train"] for _split in SPLIT: print("---", _split, "---") anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(_split)) coco = COCO(anno_file) cats = coco.loadCats(coco.getCatIds()) # print(cats[0]) cls_id = {c["name"]: c["id"] for c in cats} # 它本身就是按 category id 升序 # pprint.pprint(cls_id) with open("class-name.COCO.txt", "w") as f: for new_id, c in enumerate(cls_id): old_id = cls_id[c]# - 1 cn = c.replace(" ", "_") # format: f.write("{} {} {}\n".format(new_id, old_id, cn)) break # 只用 val set def remake_idmap(): # 合并 train、val 两个集合,统一按原本的 id(即 images 文件名中的数字,也是不连续的,且 train、val 无重合)升序重新排 0-based 的 data ID。 # 结果写入 id-map.COCO.txt # make.id-map.py """discretization of the original file ID Map the file ID to sequential {0, 1, ..., n}, and record this mapping in `id-map.txt`, with format ` `. Note that the new ids are 0-base. """ TRAIN_P = "train2017" VAL_P = "val2017" file_list = [f for f in os.listdir(os.path.join("/dataset/coco", TRAIN_P)) if (".jpg" in f)] file_list.extend([f for f in os.listdir(os.path.join("/dataset/coco", VAL_P)) if (".jpg" in f)]) print("#data:", len(file_list)) # 12,3287 id_key = lambda x: int(x.split(".jpg")[0]) file_list = sorted(file_list, key=id_key) # 按 image ID 升序 # print(file_list[:15]) with open("id-map.COCO.txt", "w") as f: # format: for i, f_name in enumerate(file_list): _original_id = id_key(f_name) f.write("{} {} {}\n".format(i, _original_id, f_name)) # if i > 5: break print("DONE") # COCO COCO_P = "/dataset/coco" ANNO_P = osp.join(COCO_P, "annotations") SPLIT = ["val", "train"] # doc2vec MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin" start_alpha = 0.01 infer_epoch = 1000 DIM = 300 # dimension of the doc2vec feature # id_map_data = {} # with open("id-map.txt", "r") as f: # for line in f: # line = line.strip() # _new_id, _old_id, _ = line.split() # id_map_data[int(_old_id)] = int(_new_id) # N_DATA = len(id_map_data) # print("#data:", N_DATA) # pre-trained Doc2Vec model # model = Doc2Vec.load(MODEL) tokenizer = RegexpTokenizer(r'\w+') def dataset_format(filepath, filename, imgid, split, sentences, cocoid): data = {} data['filepath'] = filepath data['sentids'] = [imgid * 5 + idx for idx in range(5)] data['filename'] = filename data['imgid'] = imgid data['split'] = split data['sentences'] = [{'tokens': tokenizer.tokenize(sentence), 'raw': sentence, 'imgid': imgid, 'sentid': imgid * 5 + idx} for idx, sentence in enumerate(sentences)] data['cocoid'] = cocoid return data dataset_anns = {} dataset_anns['images'] = [] dataset_anns['dataset'] = 'coco' for __split in SPLIT: print("---", __split, "---") anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(__split)) caps_file = osp.join(ANNO_P, "captions_{}2017.json".format(__split)) coco = COCO(anno_file) coco_caps = COCO(caps_file) new_image_id_file = open("id-map.COCO.txt", 'r') new_img_id_map = {image_id.strip().split(" ")[2]: image_id.strip().split(" ")[0] for image_id in new_image_id_file.readlines()} id_list = coco.getImgIds() for _old_id in tqdm(id_list): # _new_id = id_map_data[_old_id] _annIds = coco_caps.getAnnIds(imgIds=_old_id) _anns = coco_caps.loadAnns(_annIds) _filepath = __split + '2017' _filename = coco.imgs[_old_id]['file_name'] _imgid = int(new_img_id_map[_filename]) _split = __split # print(len(anns)) # pprint.pprint(anns) _sentences = [_a["caption"] for _a in _anns] _cocoid = _old_id formated_data = dataset_format(_filepath, _filename, _imgid, _split, _sentences, _cocoid) dataset_anns['images'].append(formated_data) # pprint.pprint(sentences) # sentences = [gensim.utils.simple_preprocess(s) for s in sentences] # pprint.pprint(sentences) # doc = [] # for s in sentences: # doc.extend(s) # print(doc) # vec = model.infer_vector(doc) # print(vec.shape) # texts.append(vec[np.newaxis, :]) # break # break with open('dataset_anns.json', 'w') as fp: json.dump(dataset_anns, fp) new_image_id_file.close() # texts = np.vstack(texts).astype(np.float32) # print("texts:", texts.shape, texts.dtype) # (123287, 300) dtype('