# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Visual Genome dataset."""
import json
import os
import re
from collections import defaultdict
from typing import Any, Callable, Dict, Optional
from urllib.parse import urlparse

import datasets
import dotenv

logger = datasets.logging.get_logger(__name__)

_CITATION = """\
@inproceedings{krishnavisualgenome,
  title={Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations},
  author={Krishna, Ranjay and Zhu, Yuke and Groth, Oliver and Johnson, Justin and Hata, Kenji and Kravitz, Joshua and Chen, Stephanie and Kalantidis, Yannis and Li, Li-Jia and Shamma, David A and Bernstein, Michael and Fei-Fei, Li},
  year={2016},
  url={https://arxiv.org/abs/1602.07332},
}
"""

_DESCRIPTION = """\
Visual Genome enables modeling of objects and the relationships between them. It provides dense
annotations of objects, attributes, and relationships within each image. Specifically, the dataset
contains over 108K images, where each image has an average of 35 objects, 26 attributes, and 21
pairwise relationships between objects.
"""

_HOMEPAGE = "https://homes.cs.washington.edu/~ranjay/visualgenome/"

_LICENSE = "Creative Commons Attribution 4.0 International License"

_BASE_IMAGE_URLS = {
    "https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip": "VG_100K",
    "https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip": "VG_100K_2",
}

_LATEST_VERSIONS = {
    "mask_region_descriptions": "0.0.1",
    "region_descriptions": "1.2.0",
    "objects": "1.4.0",
    "attributes": "1.2.0",
    "relationships": "1.4.0",
    "question_answers": "1.2.0",
    "image_metadata": "1.2.0",
}
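# For illustration: the builder config name combines the annotation name with its version, e.g.
# "objects_v1.2.0". The annotation archive name is derived from the same pair (see
# `VisualGenomeConfig.annotations_url` below): the latest version maps to "<name>.json.zip", older
# versions to "<name>_v<major>.json.zip" or "<name>_v<major>_<minor>.json.zip". A sketch, assuming
# the default annotation host:
#   objects @ 1.4.0 (latest)    -> objects.json.zip
#   objects @ 1.2.0             -> objects_v1_2.json.zip
#   region_descriptions @ 1.0.0 -> region_descriptions_v1.json.zip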
# ---- Features ----
# NOTE: kept compatible with the customized COCO format.
_BASE_IMAGE_METADATA_FEATURES = {
    "image_id": datasets.Value("int32"),
    "coco_url": datasets.Value("string"),
    "file_name": datasets.Value("string"),
    "width": datasets.Value("int32"),
    "height": datasets.Value("int32"),
    # "coco_id": datasets.Value("int64"),
    # "flickr_id": datasets.Value("int64"),
    "task_type": datasets.Value("string"),
}

_BASE_SYNTET_FEATURES = {
    "synset_name": datasets.Value("string"),
    "entity_name": datasets.Value("string"),
    "entity_idx_start": datasets.Value("int32"),
    "entity_idx_end": datasets.Value("int32"),
}

_BASE_OBJECT_FEATURES = {
    "object_id": datasets.Value("int32"),
    "x": datasets.Value("int32"),
    "y": datasets.Value("int32"),
    "w": datasets.Value("int32"),
    "h": datasets.Value("int32"),
    "names": [datasets.Value("string")],
    "synsets": [datasets.Value("string")],
}

_BASE_QA_OBJECT_FEATURES = {
    "object_id": datasets.Value("int32"),
    "x": datasets.Value("int32"),
    "y": datasets.Value("int32"),
    "w": datasets.Value("int32"),
    "h": datasets.Value("int32"),
    "names": [datasets.Value("string")],
    "synsets": [datasets.Value("string")],
}

_BASE_QA_OBJECT = {
    "qa_id": datasets.Value("int32"),
    "image_id": datasets.Value("int32"),
    "question": datasets.Value("string"),
    "answer": datasets.Value("string"),
    "a_objects": [_BASE_QA_OBJECT_FEATURES],
    "q_objects": [_BASE_QA_OBJECT_FEATURES],
}

_BASE_REGION_FEATURES = {
    "region_id": datasets.Value("int64"),
    "image_id": datasets.Value("int32"),
    "phrases": [datasets.Value("string")],
    "x": datasets.Value("int32"),
    "y": datasets.Value("int32"),
    "width": datasets.Value("int32"),
    "height": datasets.Value("int32"),
}

_BASE_MASK_FEATURES = {
    "size": [datasets.Value("int32")],
    "counts": datasets.Value("string"),
}

_BASE_MASK_REGION_FEATURES = {
    "region_id": datasets.Value("int64"),
    "image_id": datasets.Value("int32"),
    "phrases": [datasets.Value("string")],
    "x": datasets.Value("int32"),
    "y": datasets.Value("int32"),
    "width": datasets.Value("int32"),
    "height": datasets.Value("int32"),
    "mask": _BASE_MASK_FEATURES,
}

_BASE_RELATIONSHIP_FEATURES = {
    "relationship_id": datasets.Value("int32"),
    "predicate": datasets.Value("string"),
    "synsets": datasets.Value("string"),
    "subject": _BASE_OBJECT_FEATURES,
    "object": _BASE_OBJECT_FEATURES,
}

_NAME_VERSION_TO_ANNOTATION_FEATURES = {
    "mask_region_descriptions": {
        "0.0.1": {"regions": [_BASE_MASK_REGION_FEATURES]},
    },
    "region_descriptions": {
        "1.2.0": {"regions": [_BASE_REGION_FEATURES]},
        "1.0.0": {"regions": [_BASE_REGION_FEATURES]},
    },
    "objects": {
        "1.4.0": {
            "objects": [
                {
                    **_BASE_OBJECT_FEATURES,
                    "merged_object_ids": [datasets.Value("int32")],
                }
            ]
        },
        "1.2.0": {"objects": [_BASE_OBJECT_FEATURES]},
        "1.0.0": {"objects": [_BASE_OBJECT_FEATURES]},
    },
    "attributes": {
        "1.2.0": {"attributes": [{**_BASE_OBJECT_FEATURES, "attributes": [datasets.Value("string")]}]},
        "1.0.0": {"attributes": [{**_BASE_OBJECT_FEATURES, "attributes": [datasets.Value("string")]}]},
    },
    "relationships": {
        "1.4.0": {
            "relationships": [
                {
                    **_BASE_RELATIONSHIP_FEATURES,
                    "subject": {
                        **_BASE_OBJECT_FEATURES,
                        "merged_object_ids": [datasets.Value("int32")],
                    },
                    "object": {
                        **_BASE_OBJECT_FEATURES,
                        "merged_object_ids": [datasets.Value("int32")],
                    },
                }
            ]
        },
        "1.2.0": {"relationships": [_BASE_RELATIONSHIP_FEATURES]},
        "1.0.0": {"relationships": [_BASE_RELATIONSHIP_FEATURES]},
    },
    "question_answers": {
        "1.2.0": {"qas": [_BASE_QA_OBJECT]},
        "1.0.0": {"qas": [_BASE_QA_OBJECT]},
    },
}


# ----- Helpers -----
def _get_decompressed_filename_from_url(url: str) -> str:
    """Return the unversioned, decompressed annotation filename for a download URL."""
    parsed_url = urlparse(url)
    compressed_filename = os.path.basename(parsed_url.path)

    # Remove the `.zip` suffix.
    assert compressed_filename.endswith(".zip")
    uncompressed_filename = compressed_filename[:-4]

    # Remove the version suffix.
    unversioned_uncompressed_filename = re.sub(r"_v[0-9]+(?:_[0-9]+)?\.json$", ".json", uncompressed_filename)

    return unversioned_uncompressed_filename
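# For illustration, `_get_decompressed_filename_from_url` maps an annotation URL to the JSON file
# name found inside the extracted archive (a sketch; the exact URLs depend on the configured host):
#   ".../objects_v1_2.json.zip"        -> "objects.json"
#   ".../region_descriptions.json.zip" -> "region_descriptions.json"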
def _get_local_image_path(img_url: str, folder_local_paths: Dict[str, str]) -> str:
    """Obtain the local image path for an image URL.

    For example, given `https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg` as an image URL,
    this method returns the local path of that image.
    """
    matches = re.fullmatch(
        r"^https://cs.stanford.edu/people/rak248/(VG_100K(?:_2)?)/([0-9]+\.jpg)$",
        img_url,
    )
    assert matches is not None, f"Got img_url: {img_url}, matched: {matches}"
    folder, filename = matches.group(1), matches.group(2)
    return os.path.join(folder_local_paths[folder], filename)


def _get_local_image_suffix_path(img_url: str) -> str:
    """Obtain the folder-relative image path (suffix) for an image URL.

    For example, given `https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg` as an image URL,
    this method returns `VG_100K_2/1.jpg`.
    """
    matches = re.fullmatch(
        r"^https://cs.stanford.edu/people/rak248/(VG_100K(?:_2)?)/([0-9]+\.jpg)$",
        img_url,
    )
    assert matches is not None, f"Got img_url: {img_url}, matched: {matches}"
    folder, filename = matches.group(1), matches.group(2)
    return os.path.join(folder, filename)


# ----- Annotation normalizers ----
_BASE_ANNOTATION_URL = "https://homes.cs.washington.edu/~ranjay/visualgenome/data/dataset"


def _normalize_region_description_annotation_(annotation: Dict[str, Any]) -> Dict[str, Any]:
    """Normalizes a region descriptions annotation in-place."""
    for region in annotation["regions"]:
        # `id` should be converted to `region_id`.
        if "id" in region:
            region["region_id"] = region["id"]
            del region["id"]

        # `image` should be converted to `image_id`.
        if "image" in region:
            region["image_id"] = region["image"]
            del region["image"]

        # NOTE(xiaoke): convert the `phrase` field to a `phrases` field to stay consistent with
        # other annotations that carry multiple phrases.
        if "phrase" in region:
            region["phrases"] = [region["phrase"]] if isinstance(region["phrase"], str) else region["phrase"]
            del region["phrase"]
    return annotation


def _normalize_object_annotation_(annotation: Dict[str, Any]) -> Dict[str, Any]:
    """Normalizes an object annotation in-place."""
    for object_ in annotation["objects"]:
        # `id` should be converted to `object_id`.
        if "id" in object_:
            object_["object_id"] = object_["id"]
            del object_["id"]

        # Some versions of the `objects` annotations don't have a `synsets` field.
        if "synsets" not in object_:
            object_["synsets"] = None
    return annotation
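# For illustration, region-description normalization rewrites raw keys into the schema declared in
# `_BASE_REGION_FEATURES` (a sketch with made-up values):
#   {"regions": [{"id": 1, "image": 2, "phrase": "a dog", ...}]}
#     -> {"regions": [{"region_id": 1, "image_id": 2, "phrases": ["a dog"], ...}]}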
if "attributes" not in attribute: attribute["attributes"] = None return annotation def _normalize_relationship_annotation_(annotation: Dict[str, Any]) -> Dict[str, Any]: """Normalizes relationship annotation in-place.""" # For some reason relationships objects have a single name instead of a list of names. for relationship in annotation["relationships"]: # `id` should be converted to `object_id`: if "id" in relationship: relationship["relationship_id"] = relationship["id"] del relationship["id"] if "synsets" not in relationship: relationship["synsets"] = None subject = relationship["subject"] object_ = relationship["object"] for obj in [subject, object_]: # `id` should be converted to `object_id`: if "id" in obj: obj["object_id"] = obj["id"] del obj["id"] if "name" in obj: obj["names"] = [obj["name"]] del obj["name"] if "synsets" not in obj: obj["synsets"] = None return annotation def _normalize_image_metadata_(image_metadata: Dict[str, Any]) -> Dict[str, Any]: """Normalizes image metadata in-place.""" if "id" in image_metadata: image_metadata["image_id"] = image_metadata["id"] del image_metadata["id"] return image_metadata _ANNOTATION_NORMALIZER = defaultdict(lambda: lambda x: x) _ANNOTATION_NORMALIZER.update( { "region_descriptions": _normalize_region_description_annotation_, "objects": _normalize_object_annotation_, "attributes": _normalize_attribute_annotation_, "relationships": _normalize_relationship_annotation_, } ) # No need to normalize "mask_region_descriptions", # since it is based on "region_descriptions", # which has already been normalized. # ---- Visual Genome loading script ---- class VisualGenomeConfig(datasets.BuilderConfig): """BuilderConfig for Visual Genome.""" def __init__( self, name: str, version: Optional[str] = None, with_image: bool = True, base_image_url: Optional[str] = None, base_annotation_url: Optional[str] = None, sas_key: Optional[str] = None, use_densecap_splits: bool = False, task_type: str = "caption", **kwargs, ): _version = _LATEST_VERSIONS[name] if version is None else version _name = f"{name}_v{_version}" super().__init__(version=datasets.Version(_version), name=_name, **kwargs) self._name_without_version = name self.annotations_features = _NAME_VERSION_TO_ANNOTATION_FEATURES[self._name_without_version][ self.version.version_str ] self.with_image = with_image # NOTE(xiaoke): to download files from azure self.base_annotation_url = base_annotation_url self.base_image_url = base_image_url self.sas_key = sas_key self.use_densecap_splits = use_densecap_splits self.task_type = task_type @property def image_zip_paths(self): if self.base_image_url is None: logger.warning("base_url is None. Using default base urls. Maybe unable to download images.") _image_zip_paths = _BASE_IMAGE_URLS else: if self.sas_key is None: sas_key = "" else: sas_key = self.sas_key _image_zip_paths = { f"{self.base_image_url}/images.zip{sas_key}": "VG_100K", f"{self.base_image_url}/images2.zip{sas_key}": "VG_100K_2", } logger.info(f"image_zip_paths: {_image_zip_paths}") return _image_zip_paths @property def annotations_url(self): if self.base_annotation_url is None: logger.warning("base_url is None. Using default base urls. 
            base_annotation_url = _BASE_ANNOTATION_URL
            sas_key = ""
        else:
            base_annotation_url = self.base_annotation_url
            sas_key = "" if self.sas_key is None else self.sas_key

        major, minor = self.version.major, self.version.minor
        if self.version == _LATEST_VERSIONS[self._name_without_version]:
            _annotations_url = f"{base_annotation_url}/{self._name_without_version}.json.zip{sas_key}"
        elif minor == 0:
            _annotations_url = f"{base_annotation_url}/{self._name_without_version}_v{major}.json.zip{sas_key}"
        else:
            _annotations_url = f"{base_annotation_url}/{self._name_without_version}_v{major}_{minor}.json.zip{sas_key}"
        logger.info(f"annotations_url: {_annotations_url}")
        return _annotations_url

    @property
    def image_metadata_url(self):
        if self.base_annotation_url is None:
            logger.warning("base_annotation_url is None. Using the default base URL; downloading annotations may fail.")
            base_annotation_url = _BASE_ANNOTATION_URL
            sas_key = ""
        else:
            base_annotation_url = self.base_annotation_url
            sas_key = "" if self.sas_key is None else self.sas_key

        if not self.version == _LATEST_VERSIONS["image_metadata"]:
            logger.warning(
                f"The latest image metadata version is {_LATEST_VERSIONS['image_metadata']}, but we are generating a dataset of version {self.version}. "
                "Please double check that the image data are unchanged between the two versions."
            )

        _image_metadata_url = f"{base_annotation_url}/image_data.json.zip{sas_key}"
        logger.info(f"image_metadata_url: {_image_metadata_url}")
        return _image_metadata_url

    @property
    def features(self):
        return datasets.Features(
            {
                **({"image": datasets.Image()} if self.with_image else {}),
                **_BASE_IMAGE_METADATA_FEATURES,
                **self.annotations_features,
            }
        )
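    # For illustration, with `with_image=True` a "region_descriptions" example is flattened into a
    # dict like the following (a sketch with made-up values; keys follow `features` above and the
    # yield statement in `_generate_examples`):
    #   {"image": <PIL image>, "image_id": 1, "coco_url": ".../VG_100K_2/1.jpg",
    #    "file_name": "VG_100K_2/1.jpg", "width": 800, "height": 600, "task_type": "caption",
    #    "regions": [{"region_id": ..., "image_id": 1, "phrases": ["..."], "x": ..., "y": ...,
    #                 "width": ..., "height": ...}]}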
    @property
    def densecap_splits_json_url(self):
        # NOTE: densecap_splits.json is not included in the original Visual Genome dataset.
        # We download it from "https://raw.githubusercontent.com/jcjohnson/densecap/master/info/densecap_splits.json".
        if self.base_annotation_url is None:
            logger.warning("base_annotation_url is None. Using the default base URL; downloading annotations may fail.")
            base_annotation_url = _BASE_ANNOTATION_URL
            sas_key = ""
        else:
            base_annotation_url = self.base_annotation_url
            sas_key = "" if self.sas_key is None else self.sas_key
        return f"{base_annotation_url}/densecap_splits.json{sas_key}"


class VisualGenome(datasets.GeneratorBasedBuilder):
    """Visual Genome dataset."""

    BUILDER_CONFIG_CLASS = VisualGenomeConfig
    BUILDER_CONFIGS = [
        *[VisualGenomeConfig(name="mask_region_descriptions", version=version) for version in ["0.0.1"]],
        *[VisualGenomeConfig(name="region_descriptions", version=version) for version in ["1.0.0", "1.2.0"]],
        *[VisualGenomeConfig(name="question_answers", version=version) for version in ["1.0.0", "1.2.0"]],
        *[
            VisualGenomeConfig(name="objects", version=version)
            # TODO: add support for 1.4.0
            for version in ["1.0.0", "1.2.0"]
        ],
        *[VisualGenomeConfig(name="attributes", version=version) for version in ["1.0.0", "1.2.0"]],
        *[
            VisualGenomeConfig(name="relationships", version=version)
            # TODO: add support for 1.4.0
            for version in ["1.0.0", "1.2.0"]
        ],
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=self.config.features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
            version=self.config.version,
        )

    _SPLIT_NAME_MAP = {
        "train": "TRAIN",
        "val": "VALIDATION",
        "test": "TEST",
    }

    def _split_generators(self, dl_manager):
        self.config: VisualGenomeConfig

        # Prepare the SAS key.
        if self.config.sas_key is None:
            # NOTE(xiaoke): load the SAS key from a .env file.
            logger.info(f"Try to load sas_key from .env file: {dotenv.load_dotenv('.env')}.")
            self.config.sas_key = os.getenv("VISUAL_GENOME_SAS_KEY")
        if self.config.sas_key is not None:
            logger.info(f"Using sas_key: {self.config.sas_key}")

        # Download the image metadata.
        if dl_manager.is_streaming:
            raise ValueError(
                "dl_manager.is_streaming is True. Streaming is not supported, since random access inside zip files is very slow in streaming mode."
            )
        image_metadatas_dir = dl_manager.download_and_extract(self.config.image_metadata_url)
        image_metadatas_file = os.path.join(
            image_metadatas_dir,
            _get_decompressed_filename_from_url(self.config.image_metadata_url),
        )

        # Download the annotations.
        annotations_dir = dl_manager.download_and_extract(self.config.annotations_url)
        annotations_file = os.path.join(
            annotations_dir,
            _get_decompressed_filename_from_url(self.config.annotations_url),
        )

        logger.info(f"annotations_file: {annotations_file}")
        logger.info(f"image_metadatas_file: {image_metadatas_file}")
        logger.info(f"annotations_dir: {annotations_dir}")
        logger.info(f"image_metadatas_dir: {image_metadatas_dir}")

        if self.config.use_densecap_splits:
            splits_path = dl_manager.download_and_extract(self.config.densecap_splits_json_url)
            logger.info(f"densecap splits_path: {splits_path}")
            with open(splits_path, encoding="utf-8") as fi:
                _splits = json.load(fi)

            with open(image_metadatas_file, encoding="utf-8") as fi:
                image_metadatas = json.load(fi)
            image_idx_to_sample_idx = {
                image_metadata["image_id"]: sample_idx for sample_idx, image_metadata in enumerate(image_metadatas)
            }

            # Map the densecap image ids to sample indices in the annotation files.
            splits = {}
            for name, image_id_list in _splits.items():
                splits[name] = [image_idx_to_sample_idx[image_id] for image_id in image_id_list]
        else:
            splits = dict(train=None)

        # Optionally download the images.
        if self.config.with_image:
            image_folder_keys = list(self.config.image_zip_paths.keys())
            image_dirs = dl_manager.download_and_extract(image_folder_keys)
            image_folder_local_paths = {
                self.config.image_zip_paths[key]: os.path.join(dir_, self.config.image_zip_paths[key])
                for key, dir_ in zip(image_folder_keys, image_dirs)
            }
        else:
            image_folder_local_paths = None

        return [
            datasets.SplitGenerator(
                name=getattr(datasets.Split, self._SPLIT_NAME_MAP[split]),
                gen_kwargs={
                    "image_folder_local_paths": image_folder_local_paths,
                    "image_metadatas_file": image_metadatas_file,
                    "annotations_file": annotations_file,
                    "annotation_normalizer_": _ANNOTATION_NORMALIZER[self.config._name_without_version],
                    "split_sample_idx_list": splits[split],
                },
            )
            for split in splits
        ]

    def _generate_examples(
        self,
        image_folder_local_paths: Optional[Dict[str, str]],
        image_metadatas_file: str,
        annotations_file: str,
        annotation_normalizer_: Callable[[Dict[str, Any]], Dict[str, Any]],
        split_sample_idx_list: Optional[list] = None,
    ):
        with open(annotations_file, encoding="utf-8") as fi:
            annotations = json.load(fi)
        with open(image_metadatas_file, encoding="utf-8") as fi:
            image_metadatas = json.load(fi)
        # image_metadatas = image_metadatas[: len(annotations)]  # [XXX] truncate image metadatas to pass the test
        logger.info(f"len(image_metadatas): {len(image_metadatas)}")
        logger.info(f"len(annotations): {len(annotations)}")
        assert len(image_metadatas) == len(annotations)

        if split_sample_idx_list is None:
            split_sample_idx_list = range(len(image_metadatas))

        for idx, split_idx in enumerate(split_sample_idx_list):
            image_metadata, annotation = (
                image_metadatas[split_idx],
                annotations[split_idx],
            )

            # In-place operation to normalize the image metadata.
            _normalize_image_metadata_(image_metadata)

            # Normalize `image_id` across all annotations.
            if "id" in annotation:
                # annotation["id"] corresponds to image_metadata["image_id"].
                assert (
                    image_metadata["image_id"] == annotation["id"]
                ), f"Annotation does not match the image metadata. Got image_metadata['image_id']: {image_metadata['image_id']} and annotation['id']: {annotation['id']}"
                del annotation["id"]
            else:
                assert "image_id" in annotation
                assert (
                    image_metadata["image_id"] == annotation["image_id"]
                ), f"Annotation does not match the image metadata. Got image_metadata['image_id']: {image_metadata['image_id']} and annotation['image_id']: {annotation['image_id']}"

            # Normalize the image URL across all annotations.
            if "image_url" in annotation:
                # annotation["image_url"] corresponds to image_metadata["url"].
                assert (
                    image_metadata["url"] == annotation["image_url"]
                ), f"Annotation does not match the image metadata. Got image_metadata['url']: {image_metadata['url']} and annotation['image_url']: {annotation['image_url']}"
                del annotation["image_url"]
            elif "url" in annotation:
                # annotation["url"] corresponds to image_metadata["url"].
                assert (
                    image_metadata["url"] == annotation["url"]
                ), f"Annotation does not match the image metadata. Got image_metadata['url']: {image_metadata['url']} and annotation['url']: {annotation['url']}"

            # In-place operation to normalize the annotation.
            annotation_normalizer_(annotation)

            # Optionally add the image to the example.
            if image_folder_local_paths is not None:
                filepath = _get_local_image_path(image_metadata["url"], image_folder_local_paths)
                image_dict = {"image": filepath}
            else:
                image_dict = {}

            # NOTE: only keep the file name (as in COCO), rename the url field, and remove flickr_id and coco_id.
            image_metadata["file_name"] = _get_local_image_suffix_path(image_metadata["url"])
            image_metadata["coco_url"] = image_metadata.pop("url")
            image_metadata.pop("flickr_id")
            image_metadata.pop("coco_id")

            yield idx, {**image_dict, **image_metadata, **annotation, "task_type": self.config.task_type}
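

# Usage sketch (commented out, not executed here): this script can be loaded with
# `datasets.load_dataset`. The config name combines the annotation name and version, and extra
# keyword arguments are forwarded to `VisualGenomeConfig`. The file name "visual_genome.py" and the
# Azure base URLs below are assumptions for illustration only.
#
#   import datasets
#
#   dataset = datasets.load_dataset(
#       "visual_genome.py",
#       name="region_descriptions_v1.2.0",
#       with_image=False,            # skip downloading the image zips
#       use_densecap_splits=True,    # use the DenseCap train/val/test splits
#       # base_image_url="https://<account>.blob.core.windows.net/<container>",       # optional mirror
#       # base_annotation_url="https://<account>.blob.core.windows.net/<container>",  # optional mirror
#       # sas_key="?<sas-token>",
#   )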