import json
import os
import pickle
import logging
import datasets
import pycocotools.mask as mask
import dotenv
logger = logging.getLogger(__name__)
# Add BibTeX citation
# Find for instance the citation on arxiv or on the dataset repo/website
_CITATION = """\
@article{DBLP:journals/corr/LinMBHPRDZ14,
author = {Tsung{-}Yi Lin and
Michael Maire and
Serge J. Belongie and
Lubomir D. Bourdev and
Ross B. Girshick and
James Hays and
Pietro Perona and
Deva Ramanan and
Piotr Doll{\'{a}}r and
C. Lawrence Zitnick},
title = {Microsoft {COCO:} Common Objects in Context},
journal = {CoRR},
volume = {abs/1405.0312},
year = {2014},
url = {http://arxiv.org/abs/1405.0312},
archivePrefix = {arXiv},
eprint = {1405.0312},
timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},
biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""
# Add description of the dataset here
# You can copy an official description
_DESCRIPTION = """\
COCO is a large-scale object detection, segmentation, and captioning dataset.
"""
# Add a link to an official homepage for the dataset here
_HOMEPAGE = "http://cocodataset.org/#home"
# Add the licence for the dataset here if you can find it
_LICENSE = ""
# Add link to the official dataset URLs here
# The Hugging Face datasets library doesn't host the datasets but only points to the original files
# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
# This script is supposed to work with local (downloaded) COCO dataset.
_URLs = {}
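# NOTE: a minimal, hypothetical usage sketch; the script path and base_url below are placeholders,
# and it assumes the zip archives referenced in `_split_generators` live under `base_url`:
#
#   from datasets import load_dataset
#
#   ds = load_dataset(
#       "refcoco.py",               # path to this script (placeholder)
#       "refcoco-unc",              # one of the config names built from _SPLITS
#       base_url="/path/to/data",   # local dir (or remote prefix) holding train2014.zip and refcoco.zip
#   )
#   print(ds["train"][0]["regions"][0]["phrases"])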
_BASE_REGION_FEATURES = {
"region_id": datasets.Value("int64"),
"image_id": datasets.Value("int32"),
"phrases": [datasets.Value("string")],
"x": datasets.Value("int32"),
"y": datasets.Value("int32"),
"width": datasets.Value("int32"),
"height": datasets.Value("int32"),
}
_BASE_MASK_FEATURES = {
"size": [datasets.Value("int32")],
"counts": datasets.Value("string"),
}
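# NOTE: a hedged sketch of how an emitted mask could be decoded back into a binary array with the
# `pycocotools.mask` module imported above as `mask`; `region` is a hypothetical element of a
# yielded "regions" list:
#
#   rle = {"size": region["mask"]["size"], "counts": region["mask"]["counts"].encode("utf-8")}
#   binary_mask = mask.decode(rle)  # numpy uint8 array of shape (height, width)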
_BASE_MASK_REGION_FEATURES = {
"region_id": datasets.Value("int64"),
"image_id": datasets.Value("int32"),
"phrases": [datasets.Value("string")],
"x": datasets.Value("int32"),
"y": datasets.Value("int32"),
"width": datasets.Value("int32"),
"height": datasets.Value("int32"),
"mask": _BASE_MASK_FEATURES,
}
_ANNOTATION_FEATURES = {
"region_descriptions": {"regions": [_BASE_REGION_FEATURES]},
"mask_region_descriptions": {"regions": [_BASE_MASK_REGION_FEATURES]},
}
_BASE_IMAGE_METADATA_FEATURES = {
"image_id": datasets.Value("int32"),
# "caption_id": datasets.Value("int64"),
# "caption": datasets.Value("string"),
"height": datasets.Value("int32"),
"width": datasets.Value("int32"),
"file_name": datasets.Value("string"),
"coco_url": datasets.Value("string"),
# "image_path": datasets.Value("string"),
"task_type": datasets.Value("string"),
}
_SPLIT_BYS = {
"refclef": ["unc", "berkeley"],
# NOTE: use refer2 by UNC authors
# "refcoco": ["unc", "google"],
"refcoco": ["unc"],
"refcoco+": ["unc"],
"refcocog": ["umd", "google"],
}
_SPLITS = {
"refclef-unc": ["train", "val", "testA", "testB", "testC"],
"refclef-berkeley": ["train", "val", "test"],
# **{f"refcoco-{_split_by}": ["train", "val", "test"] for _split_by in _SPLIT_BYS["refcoco"]},
# **{f"refcoco+-{_split_by}": ["train", "val", "test"] for _split_by in _SPLIT_BYS["refcoco+"]},
**{f"refcoco-{_split_by}": ["train", "val", "testA", "testB"] for _split_by in _SPLIT_BYS["refcoco"]},
**{f"refcoco+-{_split_by}": ["train", "val", "testA", "testB"] for _split_by in _SPLIT_BYS["refcoco+"]},
**{f"refcocog-{_split_by}": ["train", "val"] for _split_by in _SPLIT_BYS["refcocog"]},
}
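# NOTE: presumably a sanity check that the custom split names below are accepted by datasets.Split;
# the returned NamedSplit objects are unused.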
datasets.Split("testA")
datasets.Split("testB")
class RefCOCOBuilderConfig(datasets.BuilderConfig):
def __init__(
self,
name,
splits,
with_image=True,
with_mask=True,
base_url=None,
sas_key=None,
task_type="caption",
**kwargs,
):
super().__init__(name, **kwargs)
self.splits = splits
self.dataset_name = name.split("-")[0]
self.split_by = name.split("-")[-1]
self.with_image = with_image
self.with_mask = with_mask
self.base_url = base_url
self.sas_key = sas_key
self.task_type = task_type
@property
def features(self):
        annotation_type = "mask_region_descriptions" if self.with_mask else "region_descriptions"
        logger.info(f"Using annotation type: {annotation_type} due to with_mask={self.with_mask}")
        return datasets.Features(
            {
                **({"image": datasets.Image()} if self.with_image else {}),
                **_BASE_IMAGE_METADATA_FEATURES,
                **_ANNOTATION_FEATURES[annotation_type],
}
)
# Name of the dataset usually matches the script name with CamelCase instead of snake_case
class RefCOCODataset(datasets.GeneratorBasedBuilder):
"""An example dataset script to work with the local (downloaded) COCO dataset"""
VERSION = datasets.Version("0.0.0")
BUILDER_CONFIG_CLASS = RefCOCOBuilderConfig
BUILDER_CONFIGS = [RefCOCOBuilderConfig(name=name, splits=splits) for name, splits in _SPLITS.items()]
DEFAULT_CONFIG_NAME = "refcoco-unc"
config: RefCOCOBuilderConfig
def _info(self):
        # This method specifies the datasets.DatasetInfo object which contains the information and typings for the dataset
features = self.config.features
return datasets.DatasetInfo(
# This is the description that will appear on the datasets page.
description=_DESCRIPTION,
# This defines the different columns of the dataset and their types
            features=features,  # These are defined above because they differ between configurations
# If there's a common (input, target) tuple from the features,
# specify them here. They'll be used if as_supervised=True in
# builder.as_dataset.
supervised_keys=None,
# Homepage of the dataset for documentation
homepage=_HOMEPAGE,
# License for the dataset if available
license=_LICENSE,
# Citation for the dataset
citation=_CITATION,
)
def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
# This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
# If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name
# NOTE: we use base_url instead of data_dir
# When we use data_dir, all the paths are relative to the data_dir.
base_url = self.config.base_url
if base_url is None:
raise ValueError(
"This script is supposed to work with local or remote RefCOCO dataset. It is either a local path or remote url. The argument `base_url` in `load_dataset()` is required."
)
logger.info(f"Using base_url: {base_url}")
# _DL_URLS = {
# "train": os.path.join(data_dir, "train2017.zip"),
# "val": os.path.join(data_dir, "val2017.zip"),
# "test": os.path.join(data_dir, "test2017.zip"),
# "annotations_trainval": os.path.join(data_dir, "annotations_trainval2017.zip"),
# "image_info_test": os.path.join(data_dir, "image_info_test2017.zip"),
# }
_DL_URLS = {}
if self.config.dataset_name in ["refcoco", "refcoco+", "refcocog"]:
_DL_URLS["image_dir"] = os.path.join(base_url, "train2014.zip")
elif self.config.dataset_name == "refclef":
_DL_URLS["image_dir"] = os.path.join(base_url, "saiapr_tc-12.zip")
else:
raise ValueError(f"Unknown dataset name: {self.config.dataset_name}")
_DL_URLS["annotation_dir"] = os.path.join(base_url, f"{self.config.dataset_name}.zip")
sas_key = self.config.sas_key
if sas_key is None:
# NOTE(xiaoke): load sas_key from .env
logger.info(f"Try to load sas_key from .env file: {dotenv.load_dotenv('.env')}.")
sas_key = os.getenv("REFCOCO_SAS_KEY")
if sas_key is not None and not os.path.exists(base_url):
logger.info(f"Using sas_key: {sas_key}")
_DL_URLS = {k: f"{v}{sas_key}" for k, v in _DL_URLS.items()}
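        # NOTE: a hypothetical .env entry (the token value is a placeholder, not a real key):
        #   REFCOCO_SAS_KEY=?sv=2021-06-08&ss=b&sig=...
        # The token is appended verbatim to each archive URL, so it typically starts with "?" when
        # base_url points at remote (e.g. Azure Blob) storage.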
        if dl_manager.is_streaming:
            raise ValueError(
                "Streaming loading (dl_manager.is_streaming=True) is not supported: random access inside zip archives is very slow when streaming."
)
archive_path = dl_manager.download_and_extract(_DL_URLS)
# NOTE(xiaoke): prepare data for index generation
with open(
os.path.join(archive_path["annotation_dir"], self.config.dataset_name, f"refs({self.config.split_by}).p"),
"rb",
) as fp:
refs = pickle.load(fp)
with open(
os.path.join(archive_path["annotation_dir"], self.config.dataset_name, f"instances.json"),
"r",
encoding="UTF-8",
) as fp:
instances = json.load(fp)
self.data = {}
self.data["dataset"] = self.config.dataset_name
self.data["refs"] = refs
self.data["images"] = instances["images"]
self.data["annotations"] = instances["annotations"]
self.data["categories"] = instances["categories"]
self.createIndex()
print(f"num refs: {len(self.Refs)}")
splits = []
for split in self.config.splits:
if split == "train":
dataset = datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
# gen_kwargs={
# "json_path": os.path.join(
# archive_path["annotations_trainval"], "annotations", "captions_train2017.json"
# ),
# "image_dir": os.path.join(archive_path["train"], "train2017"),
# "split": "train",
# },
gen_kwargs={
"image_dir": archive_path["image_dir"],
"split": split,
},
)
elif split in ["val"]:
dataset = datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
# gen_kwargs={
# "json_path": os.path.join(
# archive_path["annotations_trainval"], "annotations", "captions_val2017.json"
# ),
# "image_dir": os.path.join(archive_path["val"], "val2017"),
# "split": "valid",
# },
gen_kwargs={
"image_dir": archive_path["image_dir"],
"split": split,
},
)
elif split == "test":
dataset = datasets.SplitGenerator(
name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples
# gen_kwargs={
# "json_path": os.path.join(
# archive_path["image_info_test"], "annotations", "image_info_test2017.json"
# ),
# "image_dir": os.path.join(archive_path["test"], "test2017"),
# "split": "test",
# },
gen_kwargs={
"image_dir": archive_path["image_dir"],
"split": split,
},
)
elif split in ["testA", "testB", "testC"]:
dataset = datasets.SplitGenerator(
name=datasets.Split(split),
# These kwargs will be passed to _generate_examples
# gen_kwargs={
# "json_path": os.path.join(
# archive_path["image_info_test"], "annotations", "image_info_test2017.json"
# ),
# "image_dir": os.path.join(archive_path["test"], "test2017"),
# "split": "test",
# },
gen_kwargs={
"image_dir": archive_path["image_dir"],
"split": split,
},
)
else:
raise ValueError(f"Unknown split name: {split}")
splits.append(dataset)
return splits
def _generate_examples(
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
self,
image_dir,
split,
):
"""Yields examples as (key, example) tuples."""
# This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
# The `key` is here for legacy reason (tfds) and is not important in itself.
ref_ids = self.getRefIds(split=split)
img_ids = self.getImgIds(ref_ids=ref_ids)
logger.info(f"Generating examples from {len(ref_ids)} refs and {len(img_ids)} images in split {split}...")
if self.config.dataset_name in ["refcoco", "refcoco+", "refcocog"]:
image_dir_name = "train2014"
elif self.config.dataset_name == "refclef":
image_dir_name = "saiapr_tc-12"
else:
raise ValueError(f"Unknown dataset name: {self.config.dataset_name}")
for idx, img_id in enumerate(img_ids):
img = self.Imgs[img_id]
image_metadata = {
"coco_url": img.get("coco_url", None),
"file_name": img["file_name"],
"height": img["height"],
"width": img["width"],
"image_id": img["id"],
}
image_dict = (
{"image": os.path.join(image_dir, image_dir_name, img["file_name"])} if self.config.with_image else {}
)
annotation = []
img_to_refs = self.imgToRefs[img_id]
for img_to_ref in img_to_refs:
ref_to_ann = self.refToAnn[img_to_ref["ref_id"]]
x, y, width, height = ref_to_ann["bbox"]
# NOTE: we need to convert float to int
annotation_dict = {
"image_id": img_to_ref["image_id"],
"region_id": img_to_ref["ref_id"],
"x": int(x),
"y": int(y),
"width": int(width),
"height": int(height),
}
annotation_dict["phrases"] = [sent["sent"] for sent in img_to_ref["sentences"]]
if self.config.with_mask:
if type(ref_to_ann["segmentation"][0]) == list:
rle = mask.frPyObjects(ref_to_ann["segmentation"], img["height"], img["width"])
else:
rle = ref_to_ann["segmentation"]
mask_dict = rle[0] # should be a dict, rather a list
annotation_dict["mask"] = {
"size": mask_dict["size"],
"counts": mask_dict["counts"].decode("utf-8"), # NOTE: otherwise, it leads to core dump error.
}
annotation.append(annotation_dict)
annotation = {"regions": annotation}
yield idx, {**image_dict, **image_metadata, **annotation, "task_type": self.config.task_type}
"""
{
'coco_url': Value(dtype='string', id=None),
'file_name': Value(dtype='string', id=None),
'height': Value(dtype='int32', id=None),
'image': Image(decode=True, id=None),
'image_id': Value(dtype='int32', id=None),
'regions': [{
'height': Value(dtype='int32', id=None),
'image_id': Value(dtype='int32', id=None),
'mask': {
'counts': Value(dtype='string', id=None),
'size': [Value(dtype='int32', id=None)]
},
'phrases': [Value(dtype='string', id=None)],
        'region_id': Value(dtype='int64', id=None),
'width': Value(dtype='int32', id=None),
'x': Value(dtype='int32', id=None),
'y': Value(dtype='int32', id=None)
}],
'width': Value(dtype='int32', id=None)
}
"""
# _features = [
# "image_id",
# "caption_id",
# "caption",
# "height",
# "width",
# "file_name",
# "coco_url",
# "image_path",
# "id",
# ]
# features = list(_features)
# if split in "valid":
# split = "val"
# with open(json_path, "r", encoding="UTF-8") as fp:
# data = json.load(fp)
# # list of dict
# images = data["images"]
# entries = images
# # build a dict of image_id -> image info dict
# d = {image["id"]: image for image in images}
# # list of dict
# if split in ["train", "val"]:
# annotations = data["annotations"]
# # build a dict of image_id ->
# for annotation in annotations:
# _id = annotation["id"]
# image_info = d[annotation["image_id"]]
# annotation.update(image_info)
# annotation["id"] = _id
# entries = annotations
# for id_, entry in enumerate(entries):
# entry = {k: v for k, v in entry.items() if k in features}
# if split == "test":
# entry["image_id"] = entry["id"]
# entry["id"] = -1
# entry["caption"] = -1
# entry["caption_id"] = entry.pop("id")
# entry["image_path"] = os.path.join(image_dir, entry["file_name"])
# entry = {k: entry[k] for k in _features if k in entry}
# yield str((entry["image_id"], entry["caption_id"])), entry
def createIndex(self):
# create sets of mapping
# 1) Refs: {ref_id: ref}
# 2) Anns: {ann_id: ann}
# 3) Imgs: {image_id: image}
# 4) Cats: {category_id: category_name}
# 5) Sents: {sent_id: sent}
# 6) imgToRefs: {image_id: refs}
# 7) imgToAnns: {image_id: anns}
# 8) refToAnn: {ref_id: ann}
# 9) annToRef: {ann_id: ref}
# 10) catToRefs: {category_id: refs}
# 11) sentToRef: {sent_id: ref}
# 12) sentToTokens: {sent_id: tokens}
logger.info(f"creating index for {self.config.name}...")
# fetch info from instances
Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
for ann in self.data["annotations"]:
Anns[ann["id"]] = ann
imgToAnns[ann["image_id"]] = imgToAnns.get(ann["image_id"], []) + [ann]
for img in self.data["images"]:
Imgs[img["id"]] = img
for cat in self.data["categories"]:
Cats[cat["id"]] = cat["name"]
# fetch info from refs
Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
Sents, sentToRef, sentToTokens = {}, {}, {}
for ref in self.data["refs"]:
# ids
ref_id = ref["ref_id"]
ann_id = ref["ann_id"]
category_id = ref["category_id"]
image_id = ref["image_id"]
# add mapping related to ref
Refs[ref_id] = ref
imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref]
catToRefs[category_id] = catToRefs.get(category_id, []) + [ref]
refToAnn[ref_id] = Anns[ann_id]
annToRef[ann_id] = ref
# add mapping of sent
for sent in ref["sentences"]:
Sents[sent["sent_id"]] = sent
sentToRef[sent["sent_id"]] = ref
sentToTokens[sent["sent_id"]] = sent["tokens"]
# create class members
self.Refs = Refs
self.Anns = Anns
self.Imgs = Imgs
self.Cats = Cats
self.Sents = Sents
self.imgToRefs = imgToRefs
self.imgToAnns = imgToAnns
self.refToAnn = refToAnn
self.annToRef = annToRef
self.catToRefs = catToRefs
self.sentToRef = sentToRef
self.sentToTokens = sentToTokens
logger.info("index created.")
"""
Dataset Statistic:
refcoco-unc
Refs 50000
Anns 196771
Imgs 19994
Cats 80
Sents 142210
imgToRefs 19994
imgToAnns 19994
refToAnn 50000
annToRef 50000
catToRefs 78
sentToRef 142210
sentToTokens 142210
"""
def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=""):
        image_ids = image_ids if isinstance(image_ids, list) else [image_ids]
        cat_ids = cat_ids if isinstance(cat_ids, list) else [cat_ids]
        ref_ids = ref_ids if isinstance(ref_ids, list) else [ref_ids]
if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0:
refs = self.data["refs"]
else:
if not len(image_ids) == 0:
                refs = [ref for image_id in image_ids for ref in self.imgToRefs[image_id]]  # NOTE: flatten the per-image ref lists
else:
refs = self.data["refs"]
if not len(cat_ids) == 0:
refs = [ref for ref in refs if ref["category_id"] in cat_ids]
if not len(ref_ids) == 0:
refs = [ref for ref in refs if ref["ref_id"] in ref_ids]
if not len(split) == 0:
if split in ["testA", "testB", "testC"]:
# we also consider testAB, testBC, ...
refs = [ref for ref in refs if split[-1] in ref["split"]]
elif split in ["testAB", "testBC", "testAC"]:
# rarely used I guess...
refs = [ref for ref in refs if ref["split"] == split]
elif split == "test":
refs = [ref for ref in refs if "test" in ref["split"]]
elif split == "train" or split == "val":
refs = [ref for ref in refs if ref["split"] == split]
else:
raise ValueError("No such split [%s]" % split)
ref_ids = [ref["ref_id"] for ref in refs]
return ref_ids
def getImgIds(self, ref_ids=[]):
        ref_ids = ref_ids if isinstance(ref_ids, list) else [ref_ids]
if not len(ref_ids) == 0:
            image_ids = list({self.Refs[ref_id]["image_id"] for ref_id in ref_ids})
else:
image_ids = list(self.Imgs.keys())
return image_ids