deepspeed / src /data /data_scripts /visual_genome.py

init

002bd9b 12 months ago

26.2 kB

	# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""Visual Genome dataset."""

	import json
	import os
	import re
	from collections import defaultdict
	from typing import Any, Callable, Dict, Optional
	from urllib.parse import urlparse

	import datasets
	import dotenv


	# Module-level logger obtained from the `datasets` logging helpers.
	logger = datasets.logging.get_logger(__name__)

	# BibTeX entry handed to `datasets.DatasetInfo(citation=...)` by the builder.
	_CITATION = """\
	@inproceedings{krishnavisualgenome,
	title={Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations},
	author={Krishna, Ranjay and Zhu, Yuke and Groth, Oliver and Johnson, Justin and Hata, Kenji and Kravitz, Joshua and Chen, Stephanie and Kalantidis, Yannis and Li, Li-Jia and Shamma, David A and Bernstein, Michael and Fei-Fei, Li},
	year = {2016},
	url = {https://arxiv.org/abs/1602.07332},
	}
	"""

	# Human-readable summary handed to `datasets.DatasetInfo(description=...)`.
	_DESCRIPTION = """\
	Visual Genome enable to model objects and relationships between objects.
	They collect dense annotations of objects, attributes, and relationships within each image.
	Specifically, the dataset contains over 108K images where each image has an average of 35 objects, 26 attributes, and 21 pairwise relationships between objects.
	"""

	# Project homepage reported in the dataset info.
	_HOMEPAGE = "https://homes.cs.washington.edu/~ranjay/visualgenome/"

	# License string reported in the dataset info.
	_LICENSE = "Creative Commons Attribution 4.0 International License"

	# Default image archives: download url -> name of the folder the images belong to.
	# Used by `VisualGenomeConfig.image_zip_paths` when no `base_image_url` is configured.
	# NOTE(review): both urls live under `VG_100K_2/` although the first one maps to
	# the `VG_100K` folder -- presumably how the upstream server is laid out; confirm.
	_BASE_IMAGE_URLS = {
	"https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip": "VG_100K",
	"https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip": "VG_100K_2",
	}

	# Config name -> latest known version string.
	# `VisualGenomeConfig` uses it as the default version, and to decide whether the
	# annotation file name carries a `_v<major>[_<minor>]` tag (the latest one does not).
	_LATEST_VERSIONS = {
	"mask_region_descriptions": "0.0.1",
	"region_descriptions": "1.2.0",
	"objects": "1.4.0",
	"attributes": "1.2.0",
	"relationships": "1.4.0",
	"question_answers": "1.2.0",
	"image_metadata": "1.2.0",
	}

	# ---- Features ----

	# Per-image metadata columns shared by every config.
	# NOTE: the layout follows the customized COCO format: `coco_url` and `file_name`
	# are filled in by `VisualGenome._generate_examples`, which also drops the raw
	# `coco_id` / `flickr_id` fields (hence the commented-out entries below).
	_BASE_IMAGE_METADATA_FEATURES = {
	"image_id": datasets.Value("int32"),
	"coco_url": datasets.Value("string"),
	"file_name": datasets.Value("string"),
	"width": datasets.Value("int32"),
	"height": datasets.Value("int32"),
	# "coco_id": datasets.Value("int64"),
	# "flickr_id": datasets.Value("int64"),
	"task_type": datasets.Value("string"),
	}

	# Synset mention schema.
	# NOTE(review): not referenced anywhere else in the visible part of this file, and
	# the name looks like a typo of "SYNSET" -- confirm before renaming or removing.
	_BASE_SYNTET_FEATURES = {
	"synset_name": datasets.Value("string"),
	"entity_name": datasets.Value("string"),
	"entity_idx_start": datasets.Value("int32"),
	"entity_idx_end": datasets.Value("int32"),
	}

	# One annotated object: an id, a box given as x/y/w/h, plus its names and synsets.
	# Reused by the "objects", "attributes" and "relationships" schemas.
	_BASE_OBJECT_FEATURES = {
	"object_id": datasets.Value("int32"),
	"x": datasets.Value("int32"),
	"y": datasets.Value("int32"),
	"w": datasets.Value("int32"),
	"h": datasets.Value("int32"),
	"names": [datasets.Value("string")],
	"synsets": [datasets.Value("string")],
	}

	# Object schema used inside question/answer records.
	# Currently field-for-field identical to `_BASE_OBJECT_FEATURES`.
	_BASE_QA_OBJECT_FEATURES = {
	"object_id": datasets.Value("int32"),
	"x": datasets.Value("int32"),
	"y": datasets.Value("int32"),
	"w": datasets.Value("int32"),
	"h": datasets.Value("int32"),
	"names": [datasets.Value("string")],
	"synsets": [datasets.Value("string")],
	}

	# One question/answer pair together with the objects attached to the question
	# (`q_objects`) and to the answer (`a_objects`).
	_BASE_QA_OBJECT = {
	"qa_id": datasets.Value("int32"),
	"image_id": datasets.Value("int32"),
	"question": datasets.Value("string"),
	"answer": datasets.Value("string"),
	"a_objects": [_BASE_QA_OBJECT_FEATURES],
	"q_objects": [_BASE_QA_OBJECT_FEATURES],
	}

	# One described region. `phrases` is a list: the region normalizer wraps a single
	# `phrase` string into a one-element list.
	_BASE_REGION_FEATURES = {
	"region_id": datasets.Value("int64"),
	"image_id": datasets.Value("int32"),
	"phrases": [datasets.Value("string")],
	"x": datasets.Value("int32"),
	"y": datasets.Value("int32"),
	"width": datasets.Value("int32"),
	"height": datasets.Value("int32"),
	}

	# Mask attached to a region.
	# NOTE(review): `size` + `counts` presumably follow the COCO run-length encoding -- confirm.
	_BASE_MASK_FEATURES = {
	"size": [datasets.Value("int32")],
	"counts": datasets.Value("string"),
	}

	# Same fields as `_BASE_REGION_FEATURES` plus the region `mask`.
	_BASE_MASK_REGION_FEATURES = {
	"region_id": datasets.Value("int64"),
	"image_id": datasets.Value("int32"),
	"phrases": [datasets.Value("string")],
	"x": datasets.Value("int32"),
	"y": datasets.Value("int32"),
	"width": datasets.Value("int32"),
	"height": datasets.Value("int32"),
	"mask": _BASE_MASK_FEATURES,
	}

	# One relationship: `subject` --`predicate`--> `object`.
	# NOTE(review): `synsets` is declared as a single string here while the object
	# schema declares a list of strings -- confirm this is intended.
	_BASE_RELATIONSHIP_FEATURES = {
	"relationship_id": datasets.Value("int32"),
	"predicate": datasets.Value("string"),
	"synsets": datasets.Value("string"),
	"subject": _BASE_OBJECT_FEATURES,
	"object": _BASE_OBJECT_FEATURES,
	}

	# Annotation schema lookup: config name -> version string -> annotation features.
	# `VisualGenomeConfig.__init__` indexes it with the config name and the version
	# string, so a (name, version) pair missing here fails with a `KeyError`.
	# The "1.4.0" entries of "objects" and "relationships" extend the object schema
	# with `merged_object_ids`.
	_NAME_VERSION_TO_ANNOTATION_FEATURES = {
	"mask_region_descriptions": {
	"0.0.1": {"regions": [_BASE_MASK_REGION_FEATURES]},
	},
	"region_descriptions": {
	"1.2.0": {"regions": [_BASE_REGION_FEATURES]},
	"1.0.0": {"regions": [_BASE_REGION_FEATURES]},
	},
	"objects": {
	"1.4.0": {
	"objects": [
	{
	**_BASE_OBJECT_FEATURES,
	"merged_object_ids": [datasets.Value("int32")],
	}
	]
	},
	"1.2.0": {"objects": [_BASE_OBJECT_FEATURES]},
	"1.0.0": {"objects": [_BASE_OBJECT_FEATURES]},
	},
	# Attribute records are objects with an extra list of attribute strings.
	"attributes": {
	"1.2.0": {"attributes": [{**_BASE_OBJECT_FEATURES, "attributes": [datasets.Value("string")]}]},
	"1.0.0": {"attributes": [{**_BASE_OBJECT_FEATURES, "attributes": [datasets.Value("string")]}]},
	},
	"relationships": {
	"1.4.0": {
	"relationships": [
	{
	**_BASE_RELATIONSHIP_FEATURES,
	# Override both endpoints so they also carry `merged_object_ids`.
	"subject": {
	**_BASE_OBJECT_FEATURES,
	"merged_object_ids": [datasets.Value("int32")],
	},
	"object": {
	**_BASE_OBJECT_FEATURES,
	"merged_object_ids": [datasets.Value("int32")],
	},
	}
	]
	},
	"1.2.0": {"relationships": [_BASE_RELATIONSHIP_FEATURES]},
	"1.0.0": {"relationships": [_BASE_RELATIONSHIP_FEATURES]},
	},
	"question_answers": {
	"1.2.0": {"qas": [_BASE_QA_OBJECT]},
	"1.0.0": {"qas": [_BASE_QA_OBJECT]},
	},
	}

	# ----- Helpers -----


	def _get_decompressed_filename_from_url(url: str) -> str:
	    """Return the unversioned name of the file stored inside a ``.zip`` download url.

	    The last component of the url path is taken (query string and fragment are
	    ignored), its ``.zip`` suffix is dropped, and a version tag such as ``_v1`` or
	    ``_v1_2`` right before a trailing ``.json`` is removed.

	    For example ``https://host/dir/objects_v1_2.json.zip?sig=abc`` gives ``objects.json``.

	    Args:
	        url: download url of a zip archive.

	    Returns:
	        The file name expected inside the extracted archive.

	    Raises:
	        AssertionError: if the last path component does not end with ``.zip``.
	    """
	    zip_suffix = ".zip"
	    compressed_filename = os.path.basename(urlparse(url).path)

	    # Raised explicitly instead of using `assert`, so that the check is not stripped
	    # under `python -O`; the exception type is kept for backward compatibility.
	    # Only the file name is reported because the url query may carry a secret key.
	    if not compressed_filename.endswith(zip_suffix):
	        raise AssertionError(f"Expected a `.zip` file name, got: {compressed_filename!r}")
	    uncompressed_filename = compressed_filename[: -len(zip_suffix)]

	    # Remove the version tag, e.g. `objects_v1_2.json` -> `objects.json`.
	    return re.sub(r"_v[0-9]+(?:_[0-9]+)?\.json$", ".json", uncompressed_filename)


	# Image urls look like `https://cs.stanford.edu/people/rak248/<folder>/<digits>.jpg`
	# where <folder> is `VG_100K` or `VG_100K_2`. Dots of the host name are escaped so
	# that they only match a literal dot.
	_IMAGE_URL_PATTERN = re.compile(r"https://cs\.stanford\.edu/people/rak248/(VG_100K(?:_2)?)/([0-9]+\.jpg)")


	def _split_image_url(img_url: str) -> tuple:
	    """Split a Visual Genome image url into its ``(folder, filename)`` parts.

	    Raises:
	        AssertionError: if `img_url` does not have the expected layout.
	    """
	    matches = _IMAGE_URL_PATTERN.fullmatch(img_url)
	    # Raised explicitly instead of using `assert`, so that the check is not stripped
	    # under `python -O`; the exception type and message are unchanged.
	    if matches is None:
	        raise AssertionError(f"Got img_url: {img_url}, matched: {matches}")
	    return matches.group(1), matches.group(2)


	def _get_local_image_path(img_url: str, folder_local_paths: Dict[str, str]) -> str:
	    """Return the local path of an image given its url.

	    For example, given `https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg`, this
	    returns `<folder_local_paths["VG_100K_2"]>/1.jpg`.

	    Args:
	        img_url: image url as found in the image metadata.
	        folder_local_paths: maps a folder name (`VG_100K` or `VG_100K_2`) to the
	            local directory holding its images.

	    Raises:
	        AssertionError: if `img_url` does not have the expected layout.
	        KeyError: if the folder of `img_url` is missing from `folder_local_paths`.
	    """
	    folder, filename = _split_image_url(img_url)
	    return os.path.join(folder_local_paths[folder], filename)


	def _get_local_image_suffix_path(img_url: str) -> str:
	    """Return the `<folder>/<filename>` relative path of an image given its url.

	    For example, given `https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg`, this
	    returns `VG_100K_2/1.jpg` (joined with the separator of the local OS).

	    Raises:
	        AssertionError: if `img_url` does not have the expected layout.
	    """
	    folder, filename = _split_image_url(img_url)
	    return os.path.join(folder, filename)


	# ----- Annotation normalizers ----

	# Default location of the annotation archives; `VisualGenomeConfig` falls back to it
	# when no `base_annotation_url` is configured.
	_BASE_ANNOTATION_URL = "https://homes.cs.washington.edu/~ranjay/visualgenome/data/dataset"


	def _normalize_region_description_annotation_(annotation: Dict[str, Any]) -> Dict[str, Any]:
	    """Normalize a region descriptions annotation in-place.

	    For every region of ``annotation["regions"]``:
	      * ``id`` is renamed to ``region_id``;
	      * ``image`` is renamed to ``image_id``;
	      * ``phrase`` is renamed to ``phrases``, a single string being wrapped into a
	        one-element list so the field always has the multi-phrase shape.

	    Args:
	        annotation: annotation of one image; must have a ``regions`` list of dicts.

	    Returns:
	        The same ``annotation`` object, for convenience.

	    Raises:
	        KeyError: if ``annotation`` has no ``regions`` key.
	    """
	    renamed_keys = (("id", "region_id"), ("image", "image_id"))

	    for region in annotation["regions"]:
	        for old_key, new_key in renamed_keys:
	            if old_key in region:
	                region[new_key] = region.pop(old_key)

	        if "phrase" in region:
	            phrase = region.pop("phrase")
	            region["phrases"] = [phrase] if isinstance(phrase, str) else phrase

	    return annotation


	def _normalize_object_annotation_(annotation: Dict[str, Any]) -> Dict[str, Any]:
	    """Normalize an objects annotation in-place.

	    For every object of ``annotation["objects"]``, ``id`` is renamed to ``object_id``
	    and a missing ``synsets`` field is filled with ``None`` (some versions of the
	    annotations do not provide it).

	    Args:
	        annotation: annotation of one image; must have an ``objects`` list of dicts.

	    Returns:
	        The same ``annotation`` object, for convenience.

	    Raises:
	        KeyError: if ``annotation`` has no ``objects`` key.
	    """
	    for object_ in annotation["objects"]:
	        if "id" in object_:
	            object_["object_id"] = object_.pop("id")

	        # Only fills the gap: an existing value (even `None`) is left untouched.
	        object_.setdefault("synsets", None)

	    return annotation


	def _normalize_attribute_annotation_(annotation: Dict[str, Any]) -> Dict[str, Any]:
	    """Normalize an attributes annotation in-place.

	    For every entry of ``annotation["attributes"]``:
	      * ``id`` is renamed to ``object_id``;
	      * ``object_names`` is renamed to ``names``;
	      * missing ``synsets`` and ``attributes`` fields are filled with ``None``
	        (some versions of the annotations do not provide them).

	    Args:
	        annotation: annotation of one image; must have an ``attributes`` list of dicts.

	    Returns:
	        The same ``annotation`` object, for convenience.

	    Raises:
	        KeyError: if ``annotation`` has no ``attributes`` key.
	    """
	    renamed_keys = (("id", "object_id"), ("object_names", "names"))
	    optional_keys = ("synsets", "attributes")

	    for attribute in annotation["attributes"]:
	        for old_key, new_key in renamed_keys:
	            if old_key in attribute:
	                attribute[new_key] = attribute.pop(old_key)

	        for key in optional_keys:
	            attribute.setdefault(key, None)

	    return annotation


	def _normalize_relationship_annotation_(annotation: Dict[str, Any]) -> Dict[str, Any]:
	    """Normalize a relationships annotation in-place.

	    For every relationship of ``annotation["relationships"]``:
	      * ``id`` is renamed to ``relationship_id``;
	      * a missing ``synsets`` field is filled with ``None``;
	      * its ``subject`` and ``object`` are normalized like objects: ``id`` becomes
	        ``object_id``, a single ``name`` becomes a one-element ``names`` list, and a
	        missing ``synsets`` field is filled with ``None``.

	    Args:
	        annotation: annotation of one image; must have a ``relationships`` list of
	            dicts, each with a ``subject`` and an ``object`` dict.

	    Returns:
	        The same ``annotation`` object, for convenience.

	    Raises:
	        KeyError: if ``relationships``, ``subject`` or ``object`` is missing.
	    """
	    for relationship in annotation["relationships"]:
	        if "id" in relationship:
	            relationship["relationship_id"] = relationship.pop("id")
	        relationship.setdefault("synsets", None)

	        # Both endpoints are looked up before either is modified.
	        endpoints = (relationship["subject"], relationship["object"])
	        for obj in endpoints:
	            if "id" in obj:
	                obj["object_id"] = obj.pop("id")

	            # Relationship endpoints carry a single name instead of a list of names.
	            if "name" in obj:
	                obj["names"] = [obj.pop("name")]

	            obj.setdefault("synsets", None)

	    return annotation


	def _normalize_image_metadata_(image_metadata: Dict[str, Any]) -> Dict[str, Any]:
	    """Normalize the metadata of one image in-place.

	    Renames ``id`` to ``image_id`` when present; every other field is left as is.

	    Args:
	        image_metadata: metadata dict of one image.

	    Returns:
	        The same ``image_metadata`` object, for convenience.
	    """
	    if "id" in image_metadata:
	        image_metadata["image_id"] = image_metadata.pop("id")
	    return image_metadata


	# Config name (without version) -> function normalizing one annotation in-place.
	# Names without an entry get an identity function from the default factory.
	# "mask_region_descriptions" deliberately has no entry: it is based on
	# "region_descriptions", which has already been normalized.
	_ANNOTATION_NORMALIZER = defaultdict(
	    lambda: lambda x: x,
	    {
	        "region_descriptions": _normalize_region_description_annotation_,
	        "objects": _normalize_object_annotation_,
	        "attributes": _normalize_attribute_annotation_,
	        "relationships": _normalize_relationship_annotation_,
	    },
	)

	# ---- Visual Genome loading script ----


	class VisualGenomeConfig(datasets.BuilderConfig):
	    """BuilderConfig for Visual Genome.

	    Besides the annotation type and version, the config knows where to download the
	    images, the annotations, the image metadata and the densecap split file from.
	    The download urls returned by the properties may end with a secret SAS key, so
	    they are only ever logged without that key.
	    """

	    def __init__(
	        self,
	        name: str,
	        version: Optional[str] = None,
	        with_image: bool = True,
	        base_image_url: Optional[str] = None,
	        base_annotation_url: Optional[str] = None,
	        sas_key: Optional[str] = None,
	        use_densecap_splits: bool = False,
	        task_type: str = "caption",
	        **kwargs,
	    ):
	        """Create a config.

	        Args:
	            name: annotation type without version, e.g. ``"region_descriptions"``.
	                The final config name is ``f"{name}_v{version}"``.
	            version: version string; defaults to ``_LATEST_VERSIONS[name]``.
	            with_image: whether examples get an ``image`` column.
	            base_image_url: url prefix hosting ``images.zip`` and ``images2.zip``;
	                ``None`` selects the default `_BASE_IMAGE_URLS`.
	            base_annotation_url: url prefix hosting the annotation files; ``None``
	                selects the default `_BASE_ANNOTATION_URL`.
	            sas_key: string appended verbatim to the urls built from the custom base
	                urls (it is never appended to the default urls).
	            use_densecap_splits: whether the builder splits the data according to
	                ``densecap_splits.json`` instead of producing a single train split.
	            task_type: value stored in the ``task_type`` column of every example.
	            **kwargs: forwarded to `datasets.BuilderConfig`.

	        Raises:
	            KeyError: if `name` (or the `name`/`version` pair) has no entry in
	                `_LATEST_VERSIONS` / `_NAME_VERSION_TO_ANNOTATION_FEATURES`.
	        """
	        _version = _LATEST_VERSIONS[name] if version is None else version
	        _name = f"{name}_v{_version}"
	        super().__init__(version=datasets.Version(_version), name=_name, **kwargs)
	        self._name_without_version = name
	        self.annotations_features = _NAME_VERSION_TO_ANNOTATION_FEATURES[self._name_without_version][
	            self.version.version_str
	        ]
	        self.with_image = with_image

	        # NOTE(xiaoke): to download files from azure
	        self.base_annotation_url = base_annotation_url
	        self.base_image_url = base_image_url
	        self.sas_key = sas_key
	        self.use_densecap_splits = use_densecap_splits
	        self.task_type = task_type

	    def _sas_suffix(self) -> str:
	        """Return the SAS key to append to a url, or ``""`` when no key is set."""
	        return "" if self.sas_key is None else self.sas_key

	    def _resolve_annotation_base(self):
	        """Return the ``(base_url, sas_suffix)`` pair shared by all annotation-side urls.

	        Without a custom `base_annotation_url`, the default base url is used and the
	        SAS key is never appended.
	        """
	        if self.base_annotation_url is None:
	            logger.warning("base_url is None. Using default base urls. Maybe unable to download annotations.")
	            return _BASE_ANNOTATION_URL, ""
	        return self.base_annotation_url, self._sas_suffix()

	    @property
	    def image_zip_paths(self):
	        """Mapping from image archive url to the name of the folder it provides."""
	        if self.base_image_url is None:
	            logger.warning("base_url is None. Using default base urls. Maybe unable to download images.")
	            logger.info(f"image_zip_paths: {_BASE_IMAGE_URLS}")
	            return _BASE_IMAGE_URLS

	        folder_by_url = {
	            f"{self.base_image_url}/images.zip": "VG_100K",
	            f"{self.base_image_url}/images2.zip": "VG_100K_2",
	        }
	        # Logged before the SAS key is appended so the secret stays out of the logs.
	        logger.info(f"image_zip_paths: {folder_by_url}")

	        sas_suffix = self._sas_suffix()
	        return {f"{url}{sas_suffix}": folder for url, folder in folder_by_url.items()}

	    @property
	    def annotations_url(self):
	        """Url of the annotation archive matching the config name and version.

	        The latest version has no version tag in its file name; older versions are
	        tagged ``_v<major>`` when the minor version is 0 and ``_v<major>_<minor>``
	        otherwise.
	        """
	        base_annotation_url, sas_suffix = self._resolve_annotation_base()

	        major, minor = self.version.major, self.version.minor
	        if self.version == _LATEST_VERSIONS[self._name_without_version]:
	            file_name = f"{self._name_without_version}.json.zip"
	        elif minor == 0:
	            file_name = f"{self._name_without_version}_v{major}.json.zip"
	        else:
	            file_name = f"{self._name_without_version}_v{major}_{minor}.json.zip"

	        url_without_key = f"{base_annotation_url}/{file_name}"
	        # Logged before the SAS key is appended so the secret stays out of the logs.
	        logger.info(f"annotations_url: {url_without_key}")
	        return f"{url_without_key}{sas_suffix}"

	    @property
	    def image_metadata_url(self):
	        """Url of the image metadata archive (``image_data.json.zip``).

	        The same file is used whatever the config version; a warning is logged when
	        the config version differs from the latest image metadata version.
	        """
	        base_annotation_url, sas_suffix = self._resolve_annotation_base()

	        if not self.version == _LATEST_VERSIONS["image_metadata"]:
	            logger.warning(
	                f"Latest image metadata version is {_LATEST_VERSIONS['image_metadata']}. Trying to generate a dataset of version: {self.version}. Please double check that image data are unchanged between the two versions."
	            )

	        url_without_key = f"{base_annotation_url}/image_data.json.zip"
	        # Logged before the SAS key is appended so the secret stays out of the logs.
	        logger.info(f"image_metadata_url: {url_without_key}")
	        return f"{url_without_key}{sas_suffix}"

	    @property
	    def features(self):
	        """Features of one example: optional image, image metadata, then annotations."""
	        return datasets.Features(
	            {
	                **({"image": datasets.Image()} if self.with_image else {}),
	                **_BASE_IMAGE_METADATA_FEATURES,
	                **self.annotations_features,
	            }
	        )

	    @property
	    def densecap_splits_json_url(self):
	        """Url of ``densecap_splits.json``, looked up next to the annotation files."""
	        # NOTE: densecap_splits.json is not included in the original Visual Genome dataset.
	        # We download it from "https://raw.githubusercontent.com/jcjohnson/densecap/master/info/densecap_splits.json".
	        # NOTE(review): with the default base url this therefore presumably points to a
	        # file that does not exist; a custom `base_annotation_url` hosting a copy of the
	        # file seems to be required -- confirm.
	        base_annotation_url, sas_suffix = self._resolve_annotation_base()
	        return f"{base_annotation_url}/densecap_splits.json{sas_suffix}"


	class VisualGenome(datasets.GeneratorBasedBuilder):
	    """Visual Genome dataset.

	    Each example pairs the metadata of one image with the annotation found at the
	    same position of the annotation file, optionally with the local image path.
	    """

	    BUILDER_CONFIG_CLASS = VisualGenomeConfig
	    BUILDER_CONFIGS = [
	        *[VisualGenomeConfig(name="mask_region_descriptions", version=version) for version in ["0.0.1"]],
	        *[VisualGenomeConfig(name="region_descriptions", version=version) for version in ["1.0.0", "1.2.0"]],
	        *[VisualGenomeConfig(name="question_answers", version=version) for version in ["1.0.0", "1.2.0"]],
	        *[
	            VisualGenomeConfig(name="objects", version=version)
	            # TODO: add support for 1.4.0
	            for version in ["1.0.0", "1.2.0"]
	        ],
	        *[VisualGenomeConfig(name="attributes", version=version) for version in ["1.0.0", "1.2.0"]],
	        *[
	            VisualGenomeConfig(name="relationships", version=version)
	            # TODO: add support for 1.4.0
	            for version in ["1.0.0", "1.2.0"]
	        ],
	    ]

	    # Split name used in this script -> attribute name on `datasets.Split`.
	    _SPLIT_NAME_MAP = {
	        "train": "TRAIN",
	        "val": "VALIDATION",
	        "test": "TEST",
	    }

	    def _info(self):
	        """Return the dataset info built from the module constants and the config."""
	        return datasets.DatasetInfo(
	            description=_DESCRIPTION,
	            features=self.config.features,
	            homepage=_HOMEPAGE,
	            license=_LICENSE,
	            citation=_CITATION,
	            version=self.config.version,
	        )

	    def _split_generators(self, dl_manager):
	        """Download everything and describe the splits to generate.

	        Without densecap splits a single train split covering every image is
	        produced. With `use_densecap_splits`, each split of ``densecap_splits.json``
	        becomes a list of positions into the image metadata file.

	        Raises:
	            ValueError: if `dl_manager` is in streaming mode.
	            KeyError: if a densecap image id is missing from the image metadata, or
	                a densecap split name is missing from `_SPLIT_NAME_MAP`.
	        """
	        self.config: VisualGenomeConfig

	        # Prepare the sas_key: fall back to the environment, after loading `.env`.
	        if self.config.sas_key is None:
	            # NOTE(xiaoke): load sas_key from .env
	            dotenv_loaded = dotenv.load_dotenv(".env")
	            logger.info(f"Try to load sas_key from .env file: {dotenv_loaded}.")
	            self.config.sas_key = os.getenv("VISUAL_GENOME_SAS_KEY")
	        if self.config.sas_key is not None:
	            # The key is a secret: only report that one is in use, never its value.
	            logger.info("Using sas_key: <redacted>")

	        if dl_manager.is_streaming is True:
	            raise ValueError(
	                "dl_manager.is_streaming is True, which is very slow due to the random access inside zip files with streaming loading."
	            )

	        # Download image meta data.
	        image_metadata_url = self.config.image_metadata_url
	        image_metadatas_dir = dl_manager.download_and_extract(image_metadata_url)
	        image_metadatas_file = os.path.join(
	            image_metadatas_dir,
	            _get_decompressed_filename_from_url(image_metadata_url),
	        )

	        # Download annotations
	        annotations_url = self.config.annotations_url
	        annotations_dir = dl_manager.download_and_extract(annotations_url)
	        annotations_file = os.path.join(
	            annotations_dir,
	            _get_decompressed_filename_from_url(annotations_url),
	        )

	        logger.info(f"annotations_file: {annotations_file}")
	        logger.info(f"image_metadatas_file: {image_metadatas_file}")
	        logger.info(f"annotations_dir: {annotations_dir}")
	        logger.info(f"image_metadatas_dir: {image_metadatas_dir}")

	        if self.config.use_densecap_splits:
	            splits_path = dl_manager.download_and_extract(self.config.densecap_splits_json_url)
	            logger.info(f"densecap splits_path: {splits_path}")
	            with open(splits_path, encoding="utf-8") as fi:
	                _splits = json.load(fi)
	            with open(image_metadatas_file, encoding="utf-8") as fi:
	                image_metadatas = json.load(fi)

	            # Raw metadata may still use `id` instead of `image_id` (this is what
	            # `_normalize_image_metadata_` renames later), so accept both here.
	            image_idx_to_sample_idx = {
	                (image_metadata["image_id"] if "image_id" in image_metadata else image_metadata["id"]): sample_idx
	                for sample_idx, image_metadata in enumerate(image_metadatas)
	            }

	            # Split name -> positions of its images in the metadata/annotation files.
	            splits = {
	                name: [image_idx_to_sample_idx[image_id] for image_id in image_id_list]
	                for name, image_id_list in _splits.items()
	            }
	        else:
	            # `None` means "every sample" for `_generate_examples`.
	            splits = dict(train=None)

	        # Optionally download images
	        if self.config.with_image:
	            # The property logs on every access, so read it only once.
	            image_zip_paths = self.config.image_zip_paths
	            image_folder_keys = list(image_zip_paths)
	            image_dirs = dl_manager.download_and_extract(image_folder_keys)
	            image_folder_local_paths = {
	                image_zip_paths[key]: os.path.join(dir_, image_zip_paths[key])
	                for key, dir_ in zip(image_folder_keys, image_dirs)
	            }
	        else:
	            image_folder_local_paths = None

	        return [
	            datasets.SplitGenerator(
	                name=getattr(datasets.Split, self._SPLIT_NAME_MAP[split]),
	                gen_kwargs={
	                    "image_folder_local_paths": image_folder_local_paths,
	                    "image_metadatas_file": image_metadatas_file,
	                    "annotations_file": annotations_file,
	                    "annotation_normalizer_": _ANNOTATION_NORMALIZER[self.config._name_without_version],
	                    "split_sample_idx_list": splits[split],
	                },
	            )
	            for split in splits
	        ]

	    def _generate_examples(
	        self,
	        image_folder_local_paths: Optional[Dict[str, str]],
	        image_metadatas_file: str,
	        annotations_file: str,
	        annotation_normalizer_: Callable[[Dict[str, Any]], Dict[str, Any]],
	        split_sample_idx_list: Optional[list] = None,
	    ):
	        """Yield ``(key, example)`` pairs for one split.

	        Args:
	            image_folder_local_paths: image folder name -> local directory, or
	                ``None`` when examples carry no image.
	            image_metadatas_file: path of the JSON list of image metadata.
	            annotations_file: path of the JSON list of annotations; it must have one
	                entry per image, in the same order as the metadata.
	            annotation_normalizer_: function normalizing one annotation in-place.
	            split_sample_idx_list: positions of the samples of this split; ``None``
	                selects every sample.

	        Yields:
	            The running index within the split, and the example dict merging the
	            optional image path, the image metadata, the annotation and the
	            ``task_type`` of the config.

	        Raises:
	            AssertionError: if metadata and annotations differ in length, or if the
	                image id / url of a metadata entry and its annotation disagree.
	        """
	        with open(annotations_file, encoding="utf-8") as fi:
	            annotations = json.load(fi)

	        with open(image_metadatas_file, encoding="utf-8") as fi:
	            image_metadatas = json.load(fi)

	        logger.info(f"len(image_metadatas): {len(image_metadatas)}")
	        logger.info(f"len(annotations): {len(annotations)}")

	        # Metadata and annotations are paired by position below.
	        assert len(image_metadatas) == len(annotations)

	        if split_sample_idx_list is None:
	            split_sample_idx_list = range(len(image_metadatas))

	        for idx, split_idx in enumerate(split_sample_idx_list):
	            image_metadata = image_metadatas[split_idx]
	            annotation = annotations[split_idx]

	            # in-place operation to normalize image_metadata
	            _normalize_image_metadata_(image_metadata)

	            # Normalize image_id across all annotations
	            if "id" in annotation:
	                # annotation["id"] corresponds to image_metadata["image_id"]
	                assert (
	                    image_metadata["image_id"] == annotation["id"]
	                ), f"Annotations doesn't match with image metadataset. Got image_metadata['image_id']: {image_metadata['image_id']} and annotations['id']: {annotation['id']}"
	                del annotation["id"]
	            else:
	                assert "image_id" in annotation
	                assert (
	                    image_metadata["image_id"] == annotation["image_id"]
	                ), f"Annotations doesn't match with image metadataset. Got image_metadata['image_id']: {image_metadata['image_id']} and annotations['image_id']: {annotation['image_id']}"

	            # Check the image url of the annotation, when it has one
	            if "image_url" in annotation:
	                # annotation["image_url"] corresponds to image_metadata["url"]
	                assert (
	                    image_metadata["url"] == annotation["image_url"]
	                ), f"Annotations doesn't match with image metadataset. Got image_metadata['url']: {image_metadata['url']} and annotations['image_url']: {annotation['image_url']}"
	                del annotation["image_url"]
	            elif "url" in annotation:
	                # annotation["url"] corresponds to image_metadata["url"]
	                assert (
	                    image_metadata["url"] == annotation["url"]
	                ), f"Annotations doesn't match with image metadataset. Got image_metadata['url']: {image_metadata['url']} and annotations['url']: {annotation['url']}"

	            # in-place operation to normalize annotations
	            annotation_normalizer_(annotation)

	            # optionally add image to the annotation
	            if image_folder_local_paths is not None:
	                filepath = _get_local_image_path(image_metadata["url"], image_folder_local_paths)
	                image_dict = {"image": filepath}
	            else:
	                image_dict = {}

	            # NOTE: only get the file_name like COCO, rename url, and remove flickr_id and coco_id.
	            image_metadata["file_name"] = _get_local_image_suffix_path(image_metadata["url"])
	            image_metadata["coco_url"] = image_metadata.pop("url")
	            # These ids are not part of the features; tolerate metadata without them.
	            image_metadata.pop("flickr_id", None)
	            image_metadata.pop("coco_id", None)

	            # Every mapping has to be unpacked with `**` to build a single flat dict.
	            yield idx, {**image_dict, **image_metadata, **annotation, "task_type": self.config.task_type}