# -*- coding: utf-8 -*-
# make.texts.py
from __future__ import print_function

import json
import os
import os.path as osp
import pprint
import re

import numpy as np
import scipy.io as sio
from nltk.tokenize import RegexpTokenizer
from pycocotools.coco import COCO
from tqdm import tqdm

# import gensim
# from gensim.models import Doc2Vec
"""Process COCO caption texts.

Python 2 is required by `jhlau/doc2vec`, and the COCO API works with
Python 2.7 as well, so a Python 2.7 virtual environment is used.

dependencies:
    matplotlib (COCO api)
    smart_open (gensim)
"""
# COCO's own annotations already assign every class an ID, but those IDs are
# not contiguous (numbered 1..90 while there are only 80 classes). Below we
# re-assign contiguous, 0-based class IDs following the ascending order of the
# original category IDs.
# Both train and val contain all classes, so only the val set is processed.
# The result is written to class-name.COCO.txt.
def remake_classname():
    """Record the class-ID mapping in `class-name.COCO.txt`.

    Each output line has the format ``<new ID> <original ID> <class name>``,
    where the new IDs are contiguous and 0-based, ordered by ascending
    original category ID. Spaces inside class names become underscores.
    """
    coco_root = "/dataset/coco"
    anno_dir = osp.join(coco_root, "annotations")
    for split in ["val", "train"]:
        print("---", split, "---")
        coco = COCO(osp.join(anno_dir, "instances_{}2017.json".format(split)))
        categories = coco.loadCats(coco.getCatIds())
        # loadCats already yields categories in ascending category-id order,
        # so plain dict insertion order is the order we want.
        name_to_orig_id = {cat["name"]: cat["id"] for cat in categories}
        with open("class-name.COCO.txt", "w") as fout:
            for new_id, name in enumerate(name_to_orig_id):
                # format: <new ID> <original ID> <class name>
                fout.write("{} {} {}\n".format(
                    new_id, name_to_orig_id[name], name.replace(" ", "_")))
        break  # the val set alone covers every class
def remake_idmap():
    """Discretize the original image-file IDs into 0-based data IDs.

    Merges the train and val sets and sorts by the original ID (the number
    in the image file name — non-contiguous, with no overlap between train
    and val). The mapping is written to `id-map.COCO.txt` with the format
    ``<new id> <original id> <image file name>``; new ids are 0-based.
    """
    coco_root = "/dataset/coco"

    def original_id(fname):
        # "000000000139.jpg" -> 139
        return int(fname.split(".jpg")[0])

    jpg_names = []
    for subdir in ("train2017", "val2017"):
        jpg_names.extend(
            n for n in os.listdir(os.path.join(coco_root, subdir))
            if ".jpg" in n)
    print("#data:", len(jpg_names))  # 123287
    jpg_names.sort(key=original_id)  # ascending image ID
    with open("id-map.COCO.txt", "w") as fout:
        # format: <new id> <original id> <image file name>
        for new_id, fname in enumerate(jpg_names):
            fout.write("{} {} {}\n".format(new_id, original_id(fname), fname))
    print("DONE")
# ---------------------------------------------------------------------------
# Script section: build a Karpathy-style caption annotation JSON for COCO.
# ---------------------------------------------------------------------------

# COCO layout
COCO_P = "/dataset/coco"
ANNO_P = osp.join(COCO_P, "annotations")
SPLIT = ["val", "train"]

# doc2vec settings (the doc2vec inference path is currently disabled below)
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
start_alpha = 0.01
infer_epoch = 1000
DIM = 300  # dimension of the doc2vec feature

# NOTE(disabled): the doc2vec variant of this script also loaded
#   id_map_data = {<old id>: <new id>} parsed from "id-map.txt"
#   model = Doc2Vec.load(MODEL)   # pre-trained Doc2Vec model
# and reported N_DATA = len(id_map_data).

# word tokenizer used for caption tokens
tokenizer = RegexpTokenizer(r'\w+')
# Compiled once: matches runs of word characters, identical to what
# nltk.tokenize.RegexpTokenizer(r'\w+') extracts (gaps=False -> re.findall).
_WORD_RE = re.compile(r"\w+")


def dataset_format(filepath, filename, imgid, split, sentences, cocoid):
    """Build one image record in the Karpathy `dataset_coco.json` layout.

    Args:
        filepath: split folder name, e.g. "val2017".
        filename: image file name, e.g. "000000000139.jpg".
        imgid: discretized 0-based image ID.
        split: split name, e.g. "val".
        sentences: list of raw caption strings for this image.
        cocoid: original COCO image ID.

    Returns:
        dict with keys (in this order, which is preserved by json.dump):
        filepath, sentids, filename, imgid, split, sentences, cocoid.

    NOTE(review): `sentids` is always 5 entries (imgid*5 .. imgid*5+4) while
    `sentences` enumerates every caption given — some COCO images have more
    than 5 captions, in which case the extra sentids exceed the fixed range.
    This mirrors the original behavior; confirm downstream consumers expect it.
    """
    data = {}
    data['filepath'] = filepath
    data['sentids'] = [imgid * 5 + idx for idx in range(5)]
    data['filename'] = filename
    data['imgid'] = imgid
    data['split'] = split
    # stdlib tokenization replaces the module-global nltk RegexpTokenizer:
    # re.findall(r"\w+", s) yields exactly the same tokens.
    data['sentences'] = [{'tokens': _WORD_RE.findall(sentence),
                          'raw': sentence,
                          'imgid': imgid,
                          'sentid': imgid * 5 + idx}
                         for idx, sentence in enumerate(sentences)]
    data['cocoid'] = cocoid
    return data
# Assemble all image records (val first, then train) into one JSON file.
dataset_anns = {'images': [], 'dataset': 'coco'}
for split_name in SPLIT:
    print("---", split_name, "---")
    coco = COCO(osp.join(ANNO_P, "instances_{}2017.json".format(split_name)))
    coco_caps = COCO(osp.join(ANNO_P, "captions_{}2017.json".format(split_name)))
    # map <image file name> -> <new 0-based id> from id-map.COCO.txt
    # (format per line: <new id> <original id> <image file name>)
    with open("id-map.COCO.txt", 'r') as fmap:
        new_img_id_map = {}
        for line in fmap.readlines():
            fields = line.strip().split(" ")
            new_img_id_map[fields[2]] = fields[0]
    for orig_img_id in tqdm(coco.getImgIds()):
        ann_ids = coco_caps.getAnnIds(imgIds=orig_img_id)
        captions = [ann["caption"] for ann in coco_caps.loadAnns(ann_ids)]
        fname = coco.imgs[orig_img_id]['file_name']
        record = dataset_format(split_name + '2017',
                                fname,
                                int(new_img_id_map[fname]),
                                split_name,
                                captions,
                                orig_img_id)
        dataset_anns['images'].append(record)
    # NOTE(disabled): the doc2vec variant preprocessed the captions with
    # gensim.utils.simple_preprocess, concatenated them into one document,
    # and appended model.infer_vector(doc) to a `texts` feature matrix.
    # Re-dumped after each split; the final write contains both splits.
    with open('dataset_anns.json', 'w') as fp:
        json.dump(dataset_anns, fp)
# NOTE(disabled): the doc2vec variant finally stacked the features and saved
# texts (123287, 300) float32 via sio.savemat("texts.COCO.doc2vec.<DIM>.mat").