# make.texts.py
from __future__ import print_function
import os
import os.path as osp
import json
import pprint
# import gensim
# from gensim.models import Doc2Vec
import numpy as np
import scipy.io as sio
from pycocotools.coco import COCO
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm

"""process texts
Python 2 is required by `jhlau/doc2vec`, and the COCO API works with Python 2.7,
so a Python 2.7 virtual environment is used here.

dependencies:
    matplotlib (COCO api)
    smart_open (gensim)
"""

# The original COCO annotations already assign each class an ID, but the IDs are not
# contiguous (labeled 1..90 while there are only 80 classes). Here, contiguous 0-based
# class IDs are re-defined following the ascending order of the original category IDs.
# Both train and val contain all the classes, so only the val set is processed here.
# The result is written to class-name.COCO.txt

def remake_classname(): 
    """process class order
    Record the mapping between the contiguous 0-based class ID,
    the original class ID and the class name in `class-name.COCO.txt`,
    with format `<new ID> <original ID> <class name>`.

    The class order follows the ascending order of the original IDs.
    (A small reader sketch for this file follows the function.)
    """

    COCO_P = "/dataset/coco"
    ANNO_P = osp.join(COCO_P, "annotations")
    SPLIT = ["val", "train"]

    for _split in SPLIT:
        print("---", _split, "---")
        anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(_split))
        coco = COCO(anno_file)
        cats = coco.loadCats(coco.getCatIds())
        # print(cats[0])
        cls_id = {c["name"]: c["id"] for c in cats}  # 它本身就是按 category id 升序
        # pprint.pprint(cls_id)
        with open("class-name.COCO.txt", "w") as f:
            for new_id, c in enumerate(cls_id):
                old_id = cls_id[c]# - 1
                cn = c.replace(" ", "_")
                # format: <new ID> <original ID> <class name>
                f.write("{} {} {}\n".format(new_id, old_id, cn))

        break  # only the val set is needed, since it already covers all classes
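
# Usage sketch (not called by the original script): read `class-name.COCO.txt` back into
# a mapping {original COCO category id -> (new 0-based id, class name)}.
def load_classname(path="class-name.COCO.txt"):
    mapping = {}
    with open(path, "r") as f:
        for line in f:
            # format: <new ID> <original ID> <class name>, spaces in names replaced by "_"
            new_id, old_id, name = line.strip().split()
            mapping[int(old_id)] = (int(new_id), name)
    return mapping
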

def remake_idmap():
    # Merge the train and val sets and re-number the data with 0-based IDs, in ascending
    # order of the original IDs (the numbers in the image file names, which are likewise
    # non-contiguous; train and val do not overlap).
    # The result is written to id-map.COCO.txt
    # make.id-map.py
    """discretization of the original file ID
    Map the file ID to sequential {0, 1, ..., n},
    and record this mapping in `id-map.txt`,
    with format `<new id> <original id> <image file name>`.

    Note that the new ids are 0-base.
    """

    TRAIN_P = "train2017"
    VAL_P = "val2017"

    file_list = [f for f in os.listdir(os.path.join("/dataset/coco", TRAIN_P)) if (".jpg" in f)]
    file_list.extend([f for f in os.listdir(os.path.join("/dataset/coco", VAL_P)) if (".jpg" in f)])
    print("#data:", len(file_list))  # 12,3287

    id_key = lambda x: int(x.split(".jpg")[0])
    file_list = sorted(file_list, key=id_key)  # ascending by image ID
    # print(file_list[:15])

    with open("id-map.COCO.txt", "w") as f:
        # format: <new id> <original id> <image file name>
        for i, f_name in enumerate(file_list):
            _original_id = id_key(f_name)
            f.write("{} {} {}\n".format(i, _original_id, f_name))
            # if i > 5: break
    print("DONE")


# COCO
COCO_P = "/dataset/coco"
ANNO_P = osp.join(COCO_P, "annotations")
SPLIT = ["val", "train"]
# doc2vec
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
start_alpha = 0.01
infer_epoch = 1000
DIM = 300  # dimension of the doc2vec feature
# id_map_data = {}
# with open("id-map.txt", "r") as f:
#     for line in f:
#         line = line.strip()
#         _new_id, _old_id, _ = line.split()
#         id_map_data[int(_old_id)] = int(_new_id)
# N_DATA = len(id_map_data)
# print("#data:", N_DATA)

# pre-trained Doc2Vec model
# model = Doc2Vec.load(MODEL)
tokenizer = RegexpTokenizer(r'\w+')
def dataset_format(filepath, filename, imgid, split, sentences, cocoid): 
    data = {} 
    data['filepath'] = filepath 
    data['sentids'] = [imgid * 5 + idx for idx in range(5)]
    data['filename'] = filename
    data['imgid'] = imgid 
    data['split'] = split 
    data['sentences'] = [{'tokens': tokenizer.tokenize(sentence), 
                            'raw': sentence, 
                            'imgid': imgid, 
                            'sentid': imgid * 5 + idx} 
                        for idx, sentence in enumerate(sentences)] 
    data['cocoid'] = cocoid 
    return data 
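
# A sketch of the record produced by dataset_format (field values are placeholders, not
# taken from the real data):
#     {
#       'filepath': 'val2017',                        # split folder
#       'sentids': [imgid * 5, ..., imgid * 5 + 4],   # global sentence ids
#       'filename': '<image file name>.jpg',
#       'imgid': <new 0-based image id from id-map.COCO.txt>,
#       'split': 'val',
#       'sentences': [{'tokens': [...], 'raw': '<caption>',
#                      'imgid': <imgid>, 'sentid': <imgid * 5 + idx>}, ...],
#       'cocoid': <original COCO image id>,
#     }
# Note that `sentids` assumes exactly 5 captions per image, while some COCO images carry
# a few more; extra captions still receive a `sentid` but are not listed in `sentids`.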

dataset_anns = {}
dataset_anns['images'] = []
dataset_anns['dataset'] = 'coco'
for __split in SPLIT:
    print("---", __split, "---")
    anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(__split))
    caps_file = osp.join(ANNO_P, "captions_{}2017.json".format(__split))
    coco = COCO(anno_file)
    coco_caps = COCO(caps_file)
    new_image_id_file = open("id-map.COCO.txt", 'r')
    new_img_id_map = {image_id.strip().split(" ")[2]: image_id.strip().split(" ")[0] for image_id in new_image_id_file.readlines()}
    id_list = coco.getImgIds()
    for _old_id in tqdm(id_list):
        # _new_id = id_map_data[_old_id]
        _annIds = coco_caps.getAnnIds(imgIds=_old_id)
        _anns = coco_caps.loadAnns(_annIds)

        _filepath = __split + '2017'
        _filename = coco.imgs[_old_id]['file_name'] 
        _imgid = int(new_img_id_map[_filename])
        _split = __split
        # print(len(anns))
        # pprint.pprint(anns)
        _sentences = [_a["caption"] for _a in _anns]
        _cocoid = _old_id
        formated_data = dataset_format(_filepath, _filename, _imgid, _split, _sentences, _cocoid)
        dataset_anns['images'].append(formated_data)
        # pprint.pprint(sentences)
        # sentences = [gensim.utils.simple_preprocess(s) for s in sentences]
        # pprint.pprint(sentences)
        # doc = []
        # for s in sentences:
        #     doc.extend(s)
        # print(doc)
        # vec = model.infer_vector(doc)
        # print(vec.shape)
        # texts.append(vec[np.newaxis, :])
        # break
    # break

with open('dataset_anns.json', 'w') as fp:
    json.dump(dataset_anns, fp)

new_image_id_file.close()

# texts = np.vstack(texts).astype(np.float32)
# print("texts:", texts.shape, texts.dtype)  # (123287, 300) dtype('<f4')
# sio.savemat("texts.COCO.doc2vec.{}.mat".format(DIM), {"texts": texts})