# make.texts.py
from __future__ import print_function
import os
import os.path as osp
import json
import pprint
import numpy as np
import scipy.io as sio
from pycocotools.coco import COCO
# import gensim
# from gensim.models import Doc2Vec
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
"""process texts
python 2 needed by `jhlau/doc2vec`, and COCO api CAN work with python 2.7.
So I choose to create a virtual env of python 2.7.
dependencies:
matplotlib (COCO api)
smart_open (gensim)
"""
# COCO's own annotations already carry an ID for each class, but the IDs are
# not contiguous (they run from 1 to 90 while there are only 80 classes).
# Below, contiguous 0-based class IDs are re-defined in ascending order of the
# original category IDs.
# Both train and val contain all the classes, so only the val set is processed.
# The result is written to class-name.COCO.txt.
def remake_classname():
    """process class order
    Record the mapping between the tightened/discretized 0-based class ID,
    the original class ID and the class name in `class-name.COCO.txt`,
    with format `<new ID> <original ID> <class name>`.
    The class order is consistent with the ascending order of the original IDs.
    """
    COCO_P = "/dataset/coco"
    ANNO_P = osp.join(COCO_P, "annotations")
    SPLIT = ["val", "train"]
    for _split in SPLIT:
        print("---", _split, "---")
        anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(_split))
        coco = COCO(anno_file)
        cats = coco.loadCats(coco.getCatIds())
        # print(cats[0])
        cls_id = {c["name"]: c["id"] for c in cats}
        # pprint.pprint(cls_id)
        with open("class-name.COCO.txt", "w") as f:
            # sort explicitly: `loadCats` returns categories in ascending id
            # order, but a plain dict does not preserve order under Python 2.7
            for new_id, c in enumerate(sorted(cls_id, key=cls_id.get)):
                old_id = cls_id[c]  # - 1
                cn = c.replace(" ", "_")
                # format: <new ID> <original ID> <class name>
                f.write("{} {} {}\n".format(new_id, old_id, cn))
        break  # the val set alone covers all classes
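
# A minimal sketch of reading the mapping back; `load_classname` is a
# hypothetical helper, not part of the original pipeline.
def load_classname(path="class-name.COCO.txt"):
    """return {original class ID: (new 0-based ID, class name)}"""
    mapping = {}
    with open(path, "r") as f:
        for line in f:
            new_id, old_id, name = line.split()
            mapping[int(old_id)] = (int(new_id), name)
    return mapping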
def remake_idmap():
    """discretization of the original file ID (originally `make.id-map.py`)
    Merge the train and val sets, then re-assign 0-based data IDs in ascending
    order of the original IDs (the numbers in the image file names, which are
    non-contiguous; train and val do not overlap).
    Map the file ID to sequential {0, 1, ..., n},
    and record this mapping in `id-map.COCO.txt`,
    with format `<new id> <original id> <image file name>`.
    Note that the new ids are 0-based.
    """
    TRAIN_P = "train2017"
    VAL_P = "val2017"
    file_list = [f for f in os.listdir(osp.join("/dataset/coco", TRAIN_P)) if (".jpg" in f)]
    file_list.extend([f for f in os.listdir(osp.join("/dataset/coco", VAL_P)) if (".jpg" in f)])
    print("#data:", len(file_list))  # 123,287
    id_key = lambda x: int(x.split(".jpg")[0])
    file_list = sorted(file_list, key=id_key)  # ascending image-ID order
    # print(file_list[:15])
    with open("id-map.COCO.txt", "w") as f:
        # format: <new id> <original id> <image file name>
        for i, f_name in enumerate(file_list):
            _original_id = id_key(f_name)
            f.write("{} {} {}\n".format(i, _original_id, f_name))
            # if i > 5: break
    print("DONE")
# COCO
COCO_P = "/dataset/coco"
ANNO_P = osp.join(COCO_P, "annotations")
SPLIT = ["val", "train"]
# doc2vec
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
start_alpha = 0.01
infer_epoch = 1000
DIM = 300 # dimension of the doc2vec feature
# id_map_data = {}
# with open("id-map.txt", "r") as f:
# for line in f:
# line = line.strip()
# _new_id, _old_id, _ = line.split()
# id_map_data[int(_old_id)] = int(_new_id)
# N_DATA = len(id_map_data)
# print("#data:", N_DATA)
# pre-trained Doc2Vec model
# model = Doc2Vec.load(MODEL)
tokenizer = RegexpTokenizer(r'\w+')
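# e.g. tokenizer.tokenize("A man rides a wave.") -> ['A', 'man', 'rides', 'a', 'wave']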
def dataset_format(filepath, filename, imgid, split, sentences, cocoid):
    """pack one image and its captions into one entry of the dataset dict
    (assumes 5 captions per image when numbering `sentids`)
    """
    data = {}
    data['filepath'] = filepath
    data['sentids'] = [imgid * 5 + idx for idx in range(5)]
    data['filename'] = filename
    data['imgid'] = imgid
    data['split'] = split
    data['sentences'] = [{'tokens': tokenizer.tokenize(sentence),
                          'raw': sentence,
                          'imgid': imgid,
                          'sentid': imgid * 5 + idx}
                         for idx, sentence in enumerate(sentences)]
    data['cocoid'] = cocoid
    return data
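
# Illustrative output of dataset_format (values are made up):
# dataset_format('val2017', '000000000139.jpg', 0, 'val', ['A kitchen.'], 139) ->
# {'filepath': 'val2017', 'sentids': [0, 1, 2, 3, 4],
#  'filename': '000000000139.jpg', 'imgid': 0, 'split': 'val',
#  'sentences': [{'tokens': ['A', 'kitchen'], 'raw': 'A kitchen.',
#                 'imgid': 0, 'sentid': 0}],
#  'cocoid': 139}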
dataset_anns = {}
dataset_anns['images'] = []
dataset_anns['dataset'] = 'coco'
# read the <file name> -> <new id> mapping written by remake_idmap() once,
# instead of re-opening the file for every split
with open("id-map.COCO.txt", 'r') as new_image_id_file:
    new_img_id_map = {line.strip().split(" ")[2]: line.strip().split(" ")[0]
                      for line in new_image_id_file}
for __split in SPLIT:
    print("---", __split, "---")
    anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(__split))
    caps_file = osp.join(ANNO_P, "captions_{}2017.json".format(__split))
    coco = COCO(anno_file)
    coco_caps = COCO(caps_file)
    id_list = coco.getImgIds()
    for _old_id in tqdm(id_list):
        # _new_id = id_map_data[_old_id]
        _annIds = coco_caps.getAnnIds(imgIds=_old_id)
        _anns = coco_caps.loadAnns(_annIds)
        _filepath = __split + '2017'
        _filename = coco.imgs[_old_id]['file_name']
        _imgid = int(new_img_id_map[_filename])
        _split = __split
        # print(len(_anns))
        # pprint.pprint(_anns)
        _sentences = [_a["caption"] for _a in _anns]
        _cocoid = _old_id
        formatted_data = dataset_format(_filepath, _filename, _imgid, _split, _sentences, _cocoid)
        dataset_anns['images'].append(formatted_data)
        # pprint.pprint(_sentences)
        # sentences = [gensim.utils.simple_preprocess(s) for s in sentences]
        # pprint.pprint(sentences)
        # doc = []
        # for s in sentences:
        #     doc.extend(s)
        # print(doc)
        # vec = model.infer_vector(doc)
        # print(vec.shape)
        # texts.append(vec[np.newaxis, :])
        # break
    # break
with open('dataset_anns.json', 'w') as fp:
    json.dump(dataset_anns, fp)
# texts = np.vstack(texts).astype(np.float32)
# print("texts:", texts.shape, texts.dtype) # (123287, 300) dtype('<f4')
# sio.savemat("texts.COCO.doc2vec.{}.mat".format(DIM), {"texts": texts})
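
# A minimal sketch of the doc2vec inference that would fill `texts` for the
# commented-out vstack/savemat lines above. It assumes the `jhlau/doc2vec`
# gensim fork; the `alpha`/`steps` keyword names may differ across gensim
# versions.
# model = Doc2Vec.load(MODEL)
# texts = []
# for _img in dataset_anns['images']:
#     _doc = [t for s in _img['sentences'] for t in s['tokens']]  # concatenate all captions' tokens
#     _vec = model.infer_vector(_doc, alpha=start_alpha, steps=infer_epoch)  # (300,)
#     texts.append(_vec[np.newaxis, :])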