import json import os import pickle import logging import datasets import pycocotools.mask as mask import dotenv logger = logging.getLogger(__name__) # Add BibTeX citation # Find for instance the citation on arxiv or on the dataset repo/website _CITATION = """\ @article{DBLP:journals/corr/LinMBHPRDZ14, author = {Tsung{-}Yi Lin and Michael Maire and Serge J. Belongie and Lubomir D. Bourdev and Ross B. Girshick and James Hays and Pietro Perona and Deva Ramanan and Piotr Doll{'{a} }r and C. Lawrence Zitnick}, title = {Microsoft {COCO:} Common Objects in Context}, journal = {CoRR}, volume = {abs/1405.0312}, year = {2014}, url = {http://arxiv.org/abs/1405.0312}, archivePrefix = {arXiv}, eprint = {1405.0312}, timestamp = {Mon, 13 Aug 2018 16:48:13 +0200}, biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14}, bibsource = {dblp computer science bibliography, https://dblp.org} } """ # Add description of the dataset here # You can copy an official description _DESCRIPTION = """\ COCO is a large-scale object detection, segmentation, and captioning dataset. """ # Add a link to an official homepage for the dataset here _HOMEPAGE = "http://cocodataset.org/#home" # Add the licence for the dataset here if you can find it _LICENSE = "" # Add link to the official dataset URLs here # The HuggingFace dataset library don't host the datasets but only point to the original files # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) # This script is supposed to work with local (downloaded) COCO dataset. _URLs = {} _BASE_REGION_FEATURES = { "region_id": datasets.Value("int64"), "image_id": datasets.Value("int32"), "phrases": [datasets.Value("string")], "x": datasets.Value("int32"), "y": datasets.Value("int32"), "width": datasets.Value("int32"), "height": datasets.Value("int32"), } _BASE_MASK_FEATURES = { "size": [datasets.Value("int32")], "counts": datasets.Value("string"), } _BASE_MASK_REGION_FEATURES = { "region_id": datasets.Value("int64"), "image_id": datasets.Value("int32"), "phrases": [datasets.Value("string")], "x": datasets.Value("int32"), "y": datasets.Value("int32"), "width": datasets.Value("int32"), "height": datasets.Value("int32"), "mask": _BASE_MASK_FEATURES, } _ANNOTATION_FEATURES = { "region_descriptions": {"regions": [_BASE_REGION_FEATURES]}, "mask_region_descriptions": {"regions": [_BASE_MASK_REGION_FEATURES]}, } _BASE_IMAGE_METADATA_FEATURES = { "image_id": datasets.Value("int32"), # "caption_id": datasets.Value("int64"), # "caption": datasets.Value("string"), "height": datasets.Value("int32"), "width": datasets.Value("int32"), "file_name": datasets.Value("string"), "coco_url": datasets.Value("string"), # "image_path": datasets.Value("string"), "task_type": datasets.Value("string"), } _SPLIT_BYS = { "refclef": ["unc", "berkeley"], # NOTE: use refer2 by UNC authors # "refcoco": ["unc", "google"], "refcoco": ["unc"], "refcoco+": ["unc"], "refcocog": ["umd", "google"], } _SPLITS = { "refclef-unc": ["train", "val", "testA", "testB", "testC"], "refclef-berkeley": ["train", "val", "test"], # **{f"refcoco-{_split_by}": ["train", "val", "test"] for _split_by in _SPLIT_BYS["refcoco"]}, # **{f"refcoco+-{_split_by}": ["train", "val", "test"] for _split_by in _SPLIT_BYS["refcoco+"]}, **{f"refcoco-{_split_by}": ["train", "val", "testA", "testB"] for _split_by in _SPLIT_BYS["refcoco"]}, **{f"refcoco+-{_split_by}": ["train", "val", "testA", "testB"] for _split_by in _SPLIT_BYS["refcoco+"]}, **{f"refcocog-{_split_by}": ["train", "val"] for _split_by in _SPLIT_BYS["refcocog"]}, } datasets.Split("testA") datasets.Split("testB") class RefCOCOBuilderConfig(datasets.BuilderConfig): def __init__( self, name, splits, with_image=True, with_mask=True, base_url=None, sas_key=None, task_type="caption", **kwargs, ): super().__init__(name, **kwargs) self.splits = splits self.dataset_name = name.split("-")[0] self.split_by = name.split("-")[-1] self.with_image = with_image self.with_mask = with_mask self.base_url = base_url self.sas_key = sas_key self.task_type = task_type @property def features(self): annoation_type = "mask_region_descriptions" if self.with_mask else "region_descriptions" logger.info(f"Using annotation type: {annoation_type} due to with_mask={self.with_mask}") return datasets.Features( { **({"image": datasets.Image()} if self.with_image else {}), **_BASE_IMAGE_METADATA_FEATURES, **_ANNOTATION_FEATURES[annoation_type], } ) # Name of the dataset usually match the script name with CamelCase instead of snake_case class RefCOCODataset(datasets.GeneratorBasedBuilder): """An example dataset script to work with the local (downloaded) COCO dataset""" VERSION = datasets.Version("0.0.0") BUILDER_CONFIG_CLASS = RefCOCOBuilderConfig BUILDER_CONFIGS = [RefCOCOBuilderConfig(name=name, splits=splits) for name, splits in _SPLITS.items()] DEFAULT_CONFIG_NAME = "refcoco-unc" config: RefCOCOBuilderConfig def _info(self): # This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset features = self.config.features return datasets.DatasetInfo( # This is the description that will appear on the datasets page. description=_DESCRIPTION, # This defines the different columns of the dataset and their types features=features, # Here we define them above because they are different between the two configurations # If there's a common (input, target) tuple from the features, # specify them here. They'll be used if as_supervised=True in # builder.as_dataset. supervised_keys=None, # Homepage of the dataset for documentation homepage=_HOMEPAGE, # License for the dataset if available license=_LICENSE, # Citation for the dataset citation=_CITATION, ) def _split_generators(self, dl_manager): """Returns SplitGenerators.""" # This method is tasked with downloading/extracting the data and defining the splits depending on the configuration # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name # NOTE: we use base_url instead of data_dir # When we use data_dir, all the paths are relative to the data_dir. base_url = self.config.base_url if base_url is None: raise ValueError( "This script is supposed to work with local or remote RefCOCO dataset. It is either a local path or remote url. The argument `base_url` in `load_dataset()` is required." ) logger.info(f"Using base_url: {base_url}") # _DL_URLS = { # "train": os.path.join(data_dir, "train2017.zip"), # "val": os.path.join(data_dir, "val2017.zip"), # "test": os.path.join(data_dir, "test2017.zip"), # "annotations_trainval": os.path.join(data_dir, "annotations_trainval2017.zip"), # "image_info_test": os.path.join(data_dir, "image_info_test2017.zip"), # } _DL_URLS = {} if self.config.dataset_name in ["refcoco", "refcoco+", "refcocog"]: _DL_URLS["image_dir"] = os.path.join(base_url, "train2014.zip") elif self.config.dataset_name == "refclef": _DL_URLS["image_dir"] = os.path.join(base_url, "saiapr_tc-12.zip") else: raise ValueError(f"Unknown dataset name: {self.config.dataset_name}") _DL_URLS["annotation_dir"] = os.path.join(base_url, f"{self.config.dataset_name}.zip") sas_key = self.config.sas_key if sas_key is None: # NOTE(xiaoke): load sas_key from .env logger.info(f"Try to load sas_key from .env file: {dotenv.load_dotenv('.env')}.") sas_key = os.getenv("REFCOCO_SAS_KEY") if sas_key is not None and not os.path.exists(base_url): logger.info(f"Using sas_key: {sas_key}") _DL_URLS = {k: f"{v}{sas_key}" for k, v in _DL_URLS.items()} if dl_manager.is_streaming is True: raise ValueError( "dl_manager.is_streaming is True, which is very slow due to the random access inside zip files with streaming loading." ) archive_path = dl_manager.download_and_extract(_DL_URLS) # NOTE(xiaoke): prepare data for index generation with open( os.path.join(archive_path["annotation_dir"], self.config.dataset_name, f"refs({self.config.split_by}).p"), "rb", ) as fp: refs = pickle.load(fp) with open( os.path.join(archive_path["annotation_dir"], self.config.dataset_name, f"instances.json"), "r", encoding="UTF-8", ) as fp: instances = json.load(fp) self.data = {} self.data["dataset"] = self.config.dataset_name self.data["refs"] = refs self.data["images"] = instances["images"] self.data["annotations"] = instances["annotations"] self.data["categories"] = instances["categories"] self.createIndex() print(f"num refs: {len(self.Refs)}") splits = [] for split in self.config.splits: if split == "train": dataset = datasets.SplitGenerator( name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples # gen_kwargs={ # "json_path": os.path.join( # archive_path["annotations_trainval"], "annotations", "captions_train2017.json" # ), # "image_dir": os.path.join(archive_path["train"], "train2017"), # "split": "train", # }, gen_kwargs={ "image_dir": archive_path["image_dir"], "split": split, }, ) elif split in ["val"]: dataset = datasets.SplitGenerator( name=datasets.Split.VALIDATION, # These kwargs will be passed to _generate_examples # gen_kwargs={ # "json_path": os.path.join( # archive_path["annotations_trainval"], "annotations", "captions_val2017.json" # ), # "image_dir": os.path.join(archive_path["val"], "val2017"), # "split": "valid", # }, gen_kwargs={ "image_dir": archive_path["image_dir"], "split": split, }, ) elif split == "test": dataset = datasets.SplitGenerator( name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples # gen_kwargs={ # "json_path": os.path.join( # archive_path["image_info_test"], "annotations", "image_info_test2017.json" # ), # "image_dir": os.path.join(archive_path["test"], "test2017"), # "split": "test", # }, gen_kwargs={ "image_dir": archive_path["image_dir"], "split": split, }, ) elif split in ["testA", "testB", "testC"]: dataset = datasets.SplitGenerator( name=datasets.Split(split), # These kwargs will be passed to _generate_examples # gen_kwargs={ # "json_path": os.path.join( # archive_path["image_info_test"], "annotations", "image_info_test2017.json" # ), # "image_dir": os.path.join(archive_path["test"], "test2017"), # "split": "test", # }, gen_kwargs={ "image_dir": archive_path["image_dir"], "split": split, }, ) else: raise ValueError(f"Unknown split name: {split}") splits.append(dataset) return splits def _generate_examples( # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` self, image_dir, split, ): """Yields examples as (key, example) tuples.""" # This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. # The `key` is here for legacy reason (tfds) and is not important in itself. ref_ids = self.getRefIds(split=split) img_ids = self.getImgIds(ref_ids=ref_ids) logger.info(f"Generating examples from {len(ref_ids)} refs and {len(img_ids)} images in split {split}...") if self.config.dataset_name in ["refcoco", "refcoco+", "refcocog"]: image_dir_name = "train2014" elif self.config.dataset_name == "refclef": image_dir_name = "saiapr_tc-12" else: raise ValueError(f"Unknown dataset name: {self.config.dataset_name}") for idx, img_id in enumerate(img_ids): img = self.Imgs[img_id] image_metadata = { "coco_url": img.get("coco_url", None), "file_name": img["file_name"], "height": img["height"], "width": img["width"], "image_id": img["id"], } image_dict = ( {"image": os.path.join(image_dir, image_dir_name, img["file_name"])} if self.config.with_image else {} ) annotation = [] img_to_refs = self.imgToRefs[img_id] for img_to_ref in img_to_refs: ref_to_ann = self.refToAnn[img_to_ref["ref_id"]] x, y, width, height = ref_to_ann["bbox"] # NOTE: we need to convert float to int annotation_dict = { "image_id": img_to_ref["image_id"], "region_id": img_to_ref["ref_id"], "x": int(x), "y": int(y), "width": int(width), "height": int(height), } annotation_dict["phrases"] = [sent["sent"] for sent in img_to_ref["sentences"]] if self.config.with_mask: if type(ref_to_ann["segmentation"][0]) == list: rle = mask.frPyObjects(ref_to_ann["segmentation"], img["height"], img["width"]) else: rle = ref_to_ann["segmentation"] mask_dict = rle[0] # should be a dict, rather a list annotation_dict["mask"] = { "size": mask_dict["size"], "counts": mask_dict["counts"].decode("utf-8"), # NOTE: otherwise, it leads to core dump error. } annotation.append(annotation_dict) annotation = {"regions": annotation} yield idx, {**image_dict, **image_metadata, **annotation, "task_type": self.config.task_type} """ { 'coco_url': Value(dtype='string', id=None), 'file_name': Value(dtype='string', id=None), 'height': Value(dtype='int32', id=None), 'image': Image(decode=True, id=None), 'image_id': Value(dtype='int32', id=None), 'regions': [{ 'height': Value(dtype='int32', id=None), 'image_id': Value(dtype='int32', id=None), 'mask': { 'counts': Value(dtype='string', id=None), 'size': [Value(dtype='int32', id=None)] }, 'phrases': [Value(dtype='string', id=None)], 'region_id': Value(dtype='int32', id=None), 'width': Value(dtype='int32', id=None), 'x': Value(dtype='int32', id=None), 'y': Value(dtype='int32', id=None) }], 'width': Value(dtype='int32', id=None) } """ # _features = [ # "image_id", # "caption_id", # "caption", # "height", # "width", # "file_name", # "coco_url", # "image_path", # "id", # ] # features = list(_features) # if split in "valid": # split = "val" # with open(json_path, "r", encoding="UTF-8") as fp: # data = json.load(fp) # # list of dict # images = data["images"] # entries = images # # build a dict of image_id -> image info dict # d = {image["id"]: image for image in images} # # list of dict # if split in ["train", "val"]: # annotations = data["annotations"] # # build a dict of image_id -> # for annotation in annotations: # _id = annotation["id"] # image_info = d[annotation["image_id"]] # annotation.update(image_info) # annotation["id"] = _id # entries = annotations # for id_, entry in enumerate(entries): # entry = {k: v for k, v in entry.items() if k in features} # if split == "test": # entry["image_id"] = entry["id"] # entry["id"] = -1 # entry["caption"] = -1 # entry["caption_id"] = entry.pop("id") # entry["image_path"] = os.path.join(image_dir, entry["file_name"]) # entry = {k: entry[k] for k in _features if k in entry} # yield str((entry["image_id"], entry["caption_id"])), entry def createIndex(self): # create sets of mapping # 1) Refs: {ref_id: ref} # 2) Anns: {ann_id: ann} # 3) Imgs: {image_id: image} # 4) Cats: {category_id: category_name} # 5) Sents: {sent_id: sent} # 6) imgToRefs: {image_id: refs} # 7) imgToAnns: {image_id: anns} # 8) refToAnn: {ref_id: ann} # 9) annToRef: {ann_id: ref} # 10) catToRefs: {category_id: refs} # 11) sentToRef: {sent_id: ref} # 12) sentToTokens: {sent_id: tokens} logger.info(f"creating index for {self.config.name}...") # fetch info from instances Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {} for ann in self.data["annotations"]: Anns[ann["id"]] = ann imgToAnns[ann["image_id"]] = imgToAnns.get(ann["image_id"], []) + [ann] for img in self.data["images"]: Imgs[img["id"]] = img for cat in self.data["categories"]: Cats[cat["id"]] = cat["name"] # fetch info from refs Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {} Sents, sentToRef, sentToTokens = {}, {}, {} for ref in self.data["refs"]: # ids ref_id = ref["ref_id"] ann_id = ref["ann_id"] category_id = ref["category_id"] image_id = ref["image_id"] # add mapping related to ref Refs[ref_id] = ref imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref] catToRefs[category_id] = catToRefs.get(category_id, []) + [ref] refToAnn[ref_id] = Anns[ann_id] annToRef[ann_id] = ref # add mapping of sent for sent in ref["sentences"]: Sents[sent["sent_id"]] = sent sentToRef[sent["sent_id"]] = ref sentToTokens[sent["sent_id"]] = sent["tokens"] # create class members self.Refs = Refs self.Anns = Anns self.Imgs = Imgs self.Cats = Cats self.Sents = Sents self.imgToRefs = imgToRefs self.imgToAnns = imgToAnns self.refToAnn = refToAnn self.annToRef = annToRef self.catToRefs = catToRefs self.sentToRef = sentToRef self.sentToTokens = sentToTokens logger.info("index created.") """ Dataset Statistic: refcoco-unc Refs 50000 Anns 196771 Imgs 19994 Cats 80 Sents 142210 imgToRefs 19994 imgToAnns 19994 refToAnn 50000 annToRef 50000 catToRefs 78 sentToRef 142210 sentToTokens 142210 """ def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=""): image_ids = image_ids if type(image_ids) == list else [image_ids] cat_ids = cat_ids if type(cat_ids) == list else [cat_ids] ref_ids = ref_ids if type(ref_ids) == list else [ref_ids] if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0: refs = self.data["refs"] else: if not len(image_ids) == 0: refs = [self.imgToRefs[image_id] for image_id in image_ids] else: refs = self.data["refs"] if not len(cat_ids) == 0: refs = [ref for ref in refs if ref["category_id"] in cat_ids] if not len(ref_ids) == 0: refs = [ref for ref in refs if ref["ref_id"] in ref_ids] if not len(split) == 0: if split in ["testA", "testB", "testC"]: # we also consider testAB, testBC, ... refs = [ref for ref in refs if split[-1] in ref["split"]] elif split in ["testAB", "testBC", "testAC"]: # rarely used I guess... refs = [ref for ref in refs if ref["split"] == split] elif split == "test": refs = [ref for ref in refs if "test" in ref["split"]] elif split == "train" or split == "val": refs = [ref for ref in refs if ref["split"] == split] else: raise ValueError("No such split [%s]" % split) ref_ids = [ref["ref_id"] for ref in refs] return ref_ids def getImgIds(self, ref_ids=[]): ref_ids = ref_ids if type(ref_ids) == list else [ref_ids] if not len(ref_ids) == 0: image_ids = list(set([self.Refs[ref_id]["image_id"] for ref_id in ref_ids])) else: image_ids = list(self.Imgs.keys()) return image_ids