# make.texts.py
from __future__ import print_function
import os
import os.path as osp
import json
import pprint
import numpy as np
import scipy.io as sio
from pycocotools.coco import COCO
# import gensim
# from gensim.models import Doc2Vec
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
"""process texts
python 2 needed by `jhlau/doc2vec`, and COCO api CAN work with python 2.7.
So I choose to create a virtual env of python 2.7.
dependencies:
matplotlib (COCO api)
smart_open (gensim)
"""
# COCO's own annotations already carry an ID for each class, but the IDs are
# not contiguous (they run from 1 to 90 while there are only 80 classes).
# Below, contiguous 0-based class IDs are re-defined in ascending order of the
# original category IDs.
# Both train and val contain all the classes, so only the val set is processed.
# The result is written to class-name.COCO.txt.
def remake_classname():
    """process class order
    Record the mapping between the tightened/discretized 0-based class ID,
    the original class ID and the class name in `class-name.COCO.txt`,
    with format `<new ID> <original ID> <class name>`.
    The class order is consistent with the ascending order of the original IDs.
    """
    COCO_P = "/dataset/coco"
    ANNO_P = osp.join(COCO_P, "annotations")
    SPLIT = ["val", "train"]
    for _split in SPLIT:
        print("---", _split, "---")
        anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(_split))
        coco = COCO(anno_file)
        cats = coco.loadCats(coco.getCatIds())
        # print(cats[0])
        cls_id = {c["name"]: c["id"] for c in cats}
        # pprint.pprint(cls_id)
        with open("class-name.COCO.txt", "w") as f:
            # sort explicitly: `loadCats` returns categories in ascending id
            # order, but a plain dict does not preserve order under Python 2.7
            for new_id, c in enumerate(sorted(cls_id, key=cls_id.get)):
                old_id = cls_id[c]  # - 1
                cn = c.replace(" ", "_")
                # format: <new ID> <original ID> <class name>
                f.write("{} {} {}\n".format(new_id, old_id, cn))
        break  # the val set alone covers all classes
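
# A minimal sketch of reading the mapping back; `load_classname` is a
# hypothetical helper, not part of the original pipeline.
def load_classname(path="class-name.COCO.txt"):
    """return {original class ID: (new 0-based ID, class name)}"""
    mapping = {}
    with open(path, "r") as f:
        for line in f:
            new_id, old_id, name = line.split()
            mapping[int(old_id)] = (int(new_id), name)
    return mapping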
def remake_idmap():
    """discretization of the original file ID (originally `make.id-map.py`)
    Merge the train and val sets, then re-assign 0-based data IDs in ascending
    order of the original IDs (the numbers in the image file names, which are
    non-contiguous; train and val do not overlap).
    Map the file ID to sequential {0, 1, ..., n},
    and record this mapping in `id-map.COCO.txt`,
    with format `<new id> <original id> <image file name>`.
    Note that the new ids are 0-based.
    """
    TRAIN_P = "train2017"
    VAL_P = "val2017"
    file_list = [f for f in os.listdir(osp.join("/dataset/coco", TRAIN_P)) if (".jpg" in f)]
    file_list.extend([f for f in os.listdir(osp.join("/dataset/coco", VAL_P)) if (".jpg" in f)])
    print("#data:", len(file_list))  # 123,287
    id_key = lambda x: int(x.split(".jpg")[0])
    file_list = sorted(file_list, key=id_key)  # ascending image-ID order
    # print(file_list[:15])
    with open("id-map.COCO.txt", "w") as f:
        # format: <new id> <original id> <image file name>
        for i, f_name in enumerate(file_list):
            _original_id = id_key(f_name)
            f.write("{} {} {}\n".format(i, _original_id, f_name))
            # if i > 5: break
    print("DONE")
# COCO
COCO_P = "/dataset/coco"
ANNO_P = osp.join(COCO_P, "annotations")
SPLIT = ["val", "train"]
# doc2vec
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
start_alpha = 0.01
infer_epoch = 1000
DIM = 300 # dimension of the doc2vec feature
# id_map_data = {}
# with open("id-map.txt", "r") as f:
# for line in f:
# line = line.strip()
# _new_id, _old_id, _ = line.split()
# id_map_data[int(_old_id)] = int(_new_id)
# N_DATA = len(id_map_data)
# print("#data:", N_DATA)
# pre-trained Doc2Vec model
# model = Doc2Vec.load(MODEL)
tokenizer = RegexpTokenizer(r'\w+')
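# e.g. tokenizer.tokenize("A man rides a wave.") -> ['A', 'man', 'rides', 'a', 'wave']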
def dataset_format(filepath, filename, imgid, split, sentences, cocoid):
    """pack one image and its captions into one entry of the dataset dict
    (assumes 5 captions per image when numbering `sentids`)
    """
    data = {}
    data['filepath'] = filepath
    data['sentids'] = [imgid * 5 + idx for idx in range(5)]
    data['filename'] = filename
    data['imgid'] = imgid
    data['split'] = split
    data['sentences'] = [{'tokens': tokenizer.tokenize(sentence),
                          'raw': sentence,
                          'imgid': imgid,
                          'sentid': imgid * 5 + idx}
                         for idx, sentence in enumerate(sentences)]
    data['cocoid'] = cocoid
    return data
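
# Illustrative output of dataset_format (values are made up):
# dataset_format('val2017', '000000000139.jpg', 0, 'val', ['A kitchen.'], 139) ->
# {'filepath': 'val2017', 'sentids': [0, 1, 2, 3, 4],
#  'filename': '000000000139.jpg', 'imgid': 0, 'split': 'val',
#  'sentences': [{'tokens': ['A', 'kitchen'], 'raw': 'A kitchen.',
#                 'imgid': 0, 'sentid': 0}],
#  'cocoid': 139}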
dataset_anns = {}
dataset_anns['images'] = []
dataset_anns['dataset'] = 'coco'
# read the <file name> -> <new id> mapping written by remake_idmap() once,
# instead of re-opening the file for every split
with open("id-map.COCO.txt", 'r') as new_image_id_file:
    new_img_id_map = {line.strip().split(" ")[2]: line.strip().split(" ")[0]
                      for line in new_image_id_file}
for __split in SPLIT:
    print("---", __split, "---")
    anno_file = osp.join(ANNO_P, "instances_{}2017.json".format(__split))
    caps_file = osp.join(ANNO_P, "captions_{}2017.json".format(__split))
    coco = COCO(anno_file)
    coco_caps = COCO(caps_file)
    id_list = coco.getImgIds()
    for _old_id in tqdm(id_list):
        # _new_id = id_map_data[_old_id]
        _annIds = coco_caps.getAnnIds(imgIds=_old_id)
        _anns = coco_caps.loadAnns(_annIds)
        _filepath = __split + '2017'
        _filename = coco.imgs[_old_id]['file_name']
        _imgid = int(new_img_id_map[_filename])
        _split = __split
        # print(len(_anns))
        # pprint.pprint(_anns)
        _sentences = [_a["caption"] for _a in _anns]
        _cocoid = _old_id
        formatted_data = dataset_format(_filepath, _filename, _imgid, _split, _sentences, _cocoid)
        dataset_anns['images'].append(formatted_data)
        # pprint.pprint(_sentences)
        # sentences = [gensim.utils.simple_preprocess(s) for s in sentences]
        # pprint.pprint(sentences)
        # doc = []
        # for s in sentences:
        #     doc.extend(s)
        # print(doc)
        # vec = model.infer_vector(doc)
        # print(vec.shape)
        # texts.append(vec[np.newaxis, :])
        # break
    # break
with open('dataset_anns.json', 'w') as fp:
    json.dump(dataset_anns, fp)
# texts = np.vstack(texts).astype(np.float32)
# print("texts:", texts.shape, texts.dtype) # (123287, 300) dtype('<f4')
# sio.savemat("texts.COCO.doc2vec.{}.mat".format(DIM), {"texts": texts})
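
# A minimal sketch of the doc2vec inference that would fill `texts` for the
# commented-out vstack/savemat lines above. It assumes the `jhlau/doc2vec`
# gensim fork; the `alpha`/`steps` keyword names may differ across gensim
# versions.
# model = Doc2Vec.load(MODEL)
# texts = []
# for _img in dataset_anns['images']:
#     _doc = [t for s in _img['sentences'] for t in s['tokens']]  # concatenate all captions' tokens
#     _vec = model.infer_vector(_doc, alpha=start_alpha, steps=infer_epoch)  # (300,)
#     texts.append(_vec[np.newaxis, :])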